# ---------------------------------------------------------------------------
# Original message envelope and text, preserved verbatim as comments:
#
# From: Piotr Mardziel
# Subject: [cs539] stratified data split
# Date: Thu, 3 Feb 2005 22:56:13 -0500
# To: cs539-all
#
# --Apple-Mail-6-333155060
# Content-Transfer-Encoding: 7bit
# Content-Type: text/plain; charset=US-ASCII; format=flowed
#
# Hellos,
#
# the previous script I emailed out did not do any kind of "stratification".
# Here is one that does:
#
# --Apple-Mail-6-333155060
# Content-Transfer-Encoding: 7bit
# Content-Type: application/octet-stream; x-unix-mode=0644; name="strat.pl"
# Content-Disposition: attachment; filename=strat.pl
# ---------------------------------------------------------------------------
#
# strat.pl - split a comma-separated data file into two files while keeping
# the per-value proportions of one attribute roughly equal in both outputs.
#
# Arguments (positional):
#   SOURCE           input file, one instance per line, fields separated by ','
#   TARGET1          output file that receives about SPLIT of the instances
#   TARGET2          output file that receives the remaining instances
#   SPLIT            fraction in [0, 1] of SOURCE to send to TARGET1
#   ATTRIBUTE_INDEX  optional field index used for stratification; negative
#                    values count from the end of the line (default: -1)
#
# Output: the two target files, a progress bar and a summary table on STDOUT.

use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# formline() pictures used by swrite() to render the summary table.
my $HEAD = <<'FORMAT';
+-------------------+----------+----------+---------+
| @|||||||||||||||| | @||||||| | @||||||| | @|||||| |
+-------------------+----------+----------+---------+
FORMAT

my $ROW = <<'FORMAT';
| @>>>>>>>>>>>>>>>> | @<<<<<<< | @>>>>>>> | @>>>>>> |
FORMAT

my $SEP = <<'FORMAT';
+-------------------+----------+----------+---------+
FORMAT

if (@ARGV < 4) {
    die "usage: perl strat.pl SOURCE TARGET1 TARGET2 SPLIT [ATTRIBUTE_INDEX]\n";
}

my ($source, $target1, $target2, $split) = @ARGV;
my $i = (defined $ARGV[4] ? $ARGV[4] : -1);

# Validate the arguments before any file is opened for writing, so that a bad
# argument can never truncate the target files.
if ($i !~ /\A-?[0-9]+\z/) {
    die "attribute index must be an integer";
}
unless (looks_like_number($split) && $split >= 0 && $split <= 1) {
    die "split must be between 0 and 1";
}

# Total number of lines in the source. consider() needs it up front to
# estimate how many lines are still to come. Counted in Perl instead of via
# `wc -l` so the file name never reaches a shell and a final line without a
# trailing newline is still counted.
my $total = count_lines($source);

open(my $in,   '<', $source)  or die "can't open $source: $!";
open(my $out1, '>', $target1) or die "can't open $target1: $!";
open(my $out2, '>', $target2) or die "can't open $target2: $!";

# $c  : lines read so far;  $c1 / $c2 : lines written to TARGET1 / TARGET2.
my ($c, $c1, $c2) = (0, 0, 0);

# Per attribute value: %count = lines seen, %dist = fraction of the lines seen
# so far, %sent = lines written to TARGET1.
my (%count, %dist, %sent);

while (defined(my $line = <$in>)) {
    $c++;    # must be incremented first: consider() divides by $c
    consider($line);

    # Refresh the progress bar once every 128 lines.
    if (($c & 127) == 0) {
        my $filled = int(20 * $c / $total);
        print "[" . ("=" x $filled) . (" " x (20 - $filled)) . "] ";
        printf("%.3f%% %d/%d\r", 100 * $c / $total, $c, $total);
    }
}

close($in);
close($out1) or die "can't close $target1: $!";
close($out2) or die "can't close $target2: $!";

# Summary table: overall totals first, then one group of rows per value.
swrite($HEAD, "file", "value", "count", "percent");
swrite($ROW, $source,  "*", $c,  1);
swrite($ROW, $target1, "*", $c1, ratio($c1, $c));
swrite($ROW, $target2, "*", $c2, ratio($c2, $c));
swrite($SEP);

foreach my $val (sort keys %count) {
    my $kept = $count{$val} - $sent{$val};    # lines of this value in TARGET2

    $dist{$val} = $count{$val} / $c;
    swrite($ROW, $source,  $val, $count{$val}, $dist{$val});
    swrite($ROW, $target1, $val, $sent{$val},  ratio($sent{$val}, $c1));
    swrite($ROW, $target2, $val, $kept,        ratio($kept, $c2));
    swrite($SEP);
}

exit(0);

# count_lines($path)
# Returns the number of lines in the file at $path; dies if it cannot be
# opened.
sub count_lines {
    my ($path) = @_;

    open(my $fh, '<', $path) or die "can't open $path: $!";
    my $lines = 0;
    while (defined(my $unused = <$fh>)) {
        $lines++;
    }
    close($fh);

    return $lines;
}

# ratio($part, $whole)
# Returns $part / $whole, or 0 when $whole is 0, so that an empty source or an
# empty target cannot trigger a division by zero in the summary.
sub ratio {
    my ($part, $whole) = @_;

    return $whole != 0 ? $part / $whole : 0;
}

# consider($line)
# Routes one input line to TARGET1 or TARGET2 and updates the counters.
#
# The stratification value is field $i of the comma-split line. Using the
# fraction of lines seen so far that carry this value ($dist{$val}):
#   $cleft = estimated number of such lines still to come,
#   $need  = number of such lines TARGET1 should still receive to end up
#            with $split of them,
# and the line goes to TARGET1 with probability $need / $cleft.
#
# Relies on the caller having incremented $c for this line already.
#
# NOTE(review): $left excludes the current line, so for the final line
# $cleft is 0, the probability stays 0, and that line always goes to TARGET2.
sub consider {
    my ($line) = @_;

    my @fields = split(/,/, $line);
    chomp $fields[-1] if @fields;    # guard: $fields[-1] on an empty list is fatal

    # A missing field is treated as the empty value instead of undef.
    my $val = defined $fields[$i] ? $fields[$i] : '';

    $count{$val}++;
    $sent{$val} = 0 unless defined $sent{$val};
    $dist{$val} = $count{$val} / $c;

    my $left  = $total - $c;
    my $cleft = $dist{$val} * $left;
    my $need  = $total * $split * $dist{$val} - $sent{$val};
    my $prob  = ($cleft != 0) ? $need / $cleft : 0;

    if (rand() < $prob) {
        $c1++;
        $sent{$val}++;
        print {$out1} $line;
    }
    else {
        $c2++;
        print {$out2} $line;
    }

    return;
}

# swrite($format, @cells)
# Renders @cells through the formline() picture $format and prints the
# result. For $ROW pictures the fourth cell is a fraction and is shown as a
# percentage with three decimals.
sub swrite {
    my ($format, @cells) = @_;

    if ($format eq $ROW) {
        $cells[3] = sprintf("%.3f", 100 * $cells[3]);
    }

    $^A = "";    # reset the format accumulator before filling it
    formline($format, @cells);
    print $^A;

    return;
}

# ---------------------------------------------------------------------------
# Remainder of the original message, preserved verbatim as comments:
#
# --Apple-Mail-6-333155060
# Content-Transfer-Encoding: 7bit
# Content-Type: text/plain; charset=US-ASCII; format=flowed
#
# Run with :
#
# perl strat.pl covtype.data blah1 blah2 .3333 -1
#
# Where covtype.data is the source data file, blah1 and blah2 will be the two
# new files, .3333 is the ratio of the source data to send to the blah1 and -1
# is the attribute index of the attribute to use for stratification. 0 is the
# first, 1 is second etc ... while -1 is last, -2 is second to last, etc...
# Source data must be one instance per line with attributes separated by a
# comma.
#
# Hope this helps someone with the next projects.
#
# -piotr
#
# --Apple-Mail-6-333155060--
# ---------------------------------------------------------------------------