User:ImplSearchBot/Code: Difference between revisions

From Rosetta Code
Content added Content deleted
m (ImplSearchBot:0)
m (ImplSearchBot:0)
 
(11 intermediate revisions by the same user not shown)
Line 4: Line 4:
use MediaWiki::Bot;
use MediaWiki::Bot;
use JSON qw/to_json from_json/;
use JSON qw/to_json from_json/;

our $editor;
our $starttime = time;
our $pagesedited = 0;
our $categorypulls = 0;
our $cachehits = 0;
our $task_count_change;
our $alltasks;
our %impldiff;
our %createdomitcategories;
our @initialcache;
our $alllanguages;
our @logoutput;


my %options;
my %options;


#---------------------
#I don't care to pollute my global scope.
# These are all the functions that break up our work
# into logical chunks.
#---------------------

# Called in a *lot* of places to check the machine for overload.
sub slumber
{
{
# Snooze.
my $wiki = 'rosettacode.org';
while(&snooze) {};
my $username;
}
my $password;
my $verbosity = 2; # verbosity level. 0 is silent. 1 is error only. 2 is updates. 3 is process, more is noisy.
my $post; # Is this an actual run?
my $cachepath = "cache/";
my $cacheonly; # Don't query the wiki for data. Just pull from cache.
my $nosvn;
my $rebuild_all;


# A reversed snooze buton
my $opt_matrix = {
sub snooze
"wiki=s" => \$wiki,
{
"username=s" => \$username,
open my $loadavg, '<', '/proc/loadavg'
"password=s" => \$password,
or die "Failed to check load average: $!";
"verbosity=s" => \$verbosity,
my $loadstr = <$loadavg>;
"post" => \$post,
close $loadavg;
"cacheonly" => \$cacheonly,
"nosvn" => \$nosvn,
"cachepath=s" => \$cachepath,
"rebuildall" => \$rebuild_all };


# Wait one minute for every process in the wait queue.
my $result = GetOptions( %$opt_matrix );
my @avgs = split / +/, $loadstr, 3;
$options{'wiki'} = $wiki;


my $sleeptime = 0;
$options{'nosvn'} = $nosvn
if defined $nosvn;


# Sleep at least one minute for each process in the wait queue.
$options{'username'} = $username
$sleeptime = $avgs[0] * 60;
if defined $username;


$options{'password'} = $password
if defined $password;


# If we're currently under notable load
$options{'post'} = $post
if($sleeptime > 30)
if defined $post;

$options{'verbosity'} = $verbosity;

$cachepath .= '/'
if('/' ne substr($cachepath, -1, 1));
$options{'cachepath'} = $cachepath;

$options{'cacheonly'} = $cacheonly
if defined $cacheonly;

$options{'rebuild_all'} = $rebuild_all
if defined $rebuild_all;

my $usage = "Usage: $0 (options)\n The available options are:\n";
$usage .= "\t--$_\n"
foreach (keys %$opt_matrix);

$usage .= "Username and password are required if you need to pull data from the wiki. Wiki defaults to Rosetta Code.\n";


my $wikineeded;

$wikineeded = "yes"
unless ( exists $options{'cacheonly'} );

$wikineeded = "yes"
if ( exists $options{'post'} );

if(defined $wikineeded)
{
{
print "Calculated sleeptime: $sleeptime\n";
unless(exists $options{'username'} and exists $options{'password'})
print "Load averages: " . join(' ', @avgs) . " ... Sleeping $sleeptime seconds\n";
{
die $usage;
sleep $sleeptime;
return $sleeptime;
}
}
}
}


return 0;


#Statistic tracking.
my $starttime = time;
my $pagesedited = 0;
my $categorypulls = 0;
my $cachehits = 0;

# Tracking for svn checkin at end.
# We *should* be the only ones writing to the cache path for now.
# Anyone else should have their own checkout of the data,
# and test runs are done with --cacheonly and --nosvn.
opendir(CACHEDIR, $options{'cachepath'})
or die "Unable to open cache directory";
my @initialcache = readdir(CACHEDIR);
closedir(CACHEDIR);

&out(scalar @initialcache . " categories initially cached\n", 4);

# Get our editor
my $editor = &geteditor();

# Get a complete listing of the tasks.
&out("Getting tasks\n", 3);
my $alltasks = &getcategory('Category:Programming Tasks');

# Get a complete listing of the languages.
&out("Getting the languages.\n", 3);
my $alllanguages= &getcategory('Category:Programming Languages');

# Quick check. Did we add or lose any tasks? If so, we've got to recalc *all*
# of the pages. :-/
my $task_count_change;
unless(exists $options{'rebuild_all'})
{
my $cacheddata = &getcacheddata('Category:Programming Tasks');
my ($added, $removed) = &diffcat($alltasks, $cacheddata);
$task_count_change = scalar @$added + scalar @$removed;
}
else
{
$task_count_change = 1; # It just needs to be nonzero...
}
}

# We want the language name, not the fully-qualified wiki name.
$_ =~ s/^Category:// foreach (@$alllanguages);

# Get a list of the languages for which we've already provided bodies for the related omit categories.
# Store it as a hash, so the lookup will be faster.
my $omitcatcontents = &getcategory('Category:Maintenance/OmitCategoriesCreated');
my %createdomitcategories = map {$_, 1} @$omitcatcontents;

&out("Identifying work to do\n", 3);

my %impldiff;

foreach my $lang (@$alllanguages)
{
my $val = &getlangwork($lang);
next unless defined $val;
$impldiff{$lang} = $val;
}

&processimplediff(\%impldiff);


my $runtime = time() - $starttime;
my $statsdata = "Pages edited last run: $pagesedited<br/>Time to post all per-language updates: $runtime seconds<br/>Category pulls: $categorypulls<br/>Cache hits: $cachehits<br/>";
&out("Updating stats page. Runtime ($runtime), Pages edited ($pagesedited)\n", 2);

&postpage("User:ImplSearchBot/Stats", $statsdata, "Updating stats data", 0);


&out("Updating bot code page\n", 4);

open my $sourcefile, '<', $0
or die "Finished without updating bot source page";

my $botsource;
$botsource .= $_ while <$sourcefile>;

close $sourcefile;

my $tag = "lang";

&postpage("User:ImplSearchBot/Code", "<$tag perl>$botsource</$tag>", 0);

&out("Updating cache\n", 4);
&commitcache();

&out("Done\n", 3);


# END OF PROGRAM EXECUTION
exit(0);

#---------------------
# These are all the functions that break up our work
# into logical chunks.
#---------------------


# Builds a simple hash ref associating a page name with body.
# Builds a simple hash ref associating a page name with body.
Line 189: Line 66:
sub build_posting
sub build_posting
{
{
&slumber;
my $name = shift;
my $body = shift;
my $name = shift;
my $body = shift;

return {$name => $body};
return {$name => $body};
}
}



# Prepares the template body for the unimplemented data.
# Prepares the template body for the unimplemented data.
sub prep_unimp_posting
sub prep_unimp_posting
{
{
&slumber;
my $impldiff = shift;
my $language = shift;
my $impldiff = shift;
my $unimplisting = "";
my $language = shift;
my $unimplisting = "";
foreach my $taskname (@$alltasks)
foreach my $taskname (@$alltasks)
{
{
# If it's a category task, the task name will be slightly different.
# If it's a category task, the task name will be slightly different.
my $baretaskname = $taskname;
$baretaskname =~ s/^Category://;
my $baretaskname = $taskname;
$baretaskname =~ s/^Category://;
my $implpage = $taskname;
$implpage =~ s/^Category:(.*)/$1\/$language/;
my $implpage = $taskname;
$implpage =~ s/^Category:(.*)/$1\/$language/;


# my $escapedImplPage = $implpage;
my $link;
if($taskname eq $baretaskname)
{
$link = "[[$taskname]]";
}
else
{
$link = "[[:$taskname|$baretaskname]]";
}
# Add the task to the unimplemented list, if it's unimplemented, and if it's not in the omit list.
unless(exists $impldiff->{$language}->{'impl'}->{$implpage} or exists $impldiff->{$language}->{'omit'}->{$implpage})
{
$unimplisting .= "* $link\n";
}

}


# $escapedImplPage =~ s/ /_/g;
return &build_posting("Template:" . &get_unimpl_template_name($language), $unimplisting);
my $link;
$link = "{{unimpl task link|$baretaskname|$language}}";
# $link = "[http://rosettacode.org/mw/index.php?action=edit&title=$escapedImplPage $implpage]";
# Add the task to the unimplemented list, if it's unimplemented, and if it's not in the omit list.
unless(exists $impldiff->{$language}->{'impl'}->{$implpage} or exists $impldiff->{$language}->{'omit'}->{$implpage})
{
$unimplisting .= "* $link\n";
}
}
return &build_posting("Template:" . &get_unimpl_template_name($language), $unimplisting);
}
}


Line 237: Line 113:
sub prep_omit_posting
sub prep_omit_posting
{
{
&slumber;
my $implediff = shift;
my $language = shift;
my $implediff = shift;
my $omitlisting = "";
my $language = shift;
my $omitlisting = "";
my $omittemplatename = &get_omit_template_name($language);
my $omittemplatename = &get_omit_template_name($language);

foreach my $taskname (@$alltasks)
foreach my $taskname (@$alltasks)
{
{
# We want the task name, not the fully-qualified wiki name.
# We want the task name, not the fully-qualified wiki name.
my $baretaskname = $taskname;
$baretaskname =~ s/^Category://;
my $baretaskname = $taskname;
$baretaskname =~ s/^Category://;
# Add the task to the omission list, if it's omitted.
# Add the task to the omission list, if it's omitted.
$omitlisting .= "* [[$baretaskname]]\n"
my $escapedTaskName = $baretaskname;
if(exists $impldiff{$language}->{'omit'}->{$taskname})
$escapedTaskName =~ s/ /_/g;
}
$omitlisting .= "* [http://rosettacode.org/mw/index.php?action=edit&title=$escapedTaskName $baretaskname]\n"

if(exists $impldiff{$language}->{'omit'}->{$taskname})
# Note that there's no data in the template.
}
$implediff->{$language}->{'omit'} = 0
if( "" eq $omitlisting );
# Note that there's no data in the template.

$implediff->{$language}->{'omit'} = 0
return &build_posting("Template:$omittemplatename", $omitlisting);
if( "" eq $omitlisting );
return &build_posting("Template:$omittemplatename", $omitlisting);
}
}


Line 262: Line 141:
sub prep_listing_posting
sub prep_listing_posting
{
{
&slumber;
my $impldiff = shift;
my $language = shift;
my $impldiff = shift;
my $language = shift;

my $targetcount = $impldiff->{$language}->{'target_count'};
my $unimpcount = $impldiff->{$language}->{'unimp_count'};
my $targetcount = $impldiff->{$language}->{'target_count'};
my $unimpcount = $impldiff->{$language}->{'unimp_count'};

# Prepare template fields
# Prepare template fields
my $langfield = "|$language";
my $unimpfield = "|$unimpcount";
my $langfield = "|$language";
my $tcfield = "|$targetcount";
my $unimpfield = "|$unimpcount";
my $tcfield = "|$targetcount";
my $impperccalc = 0;
my $impperccalc = 0;
$impperccalc = (($targetcount - $unimpcount) / $targetcount) * 100
$impperccalc = (($targetcount - $unimpcount) / $targetcount) * 100
unless ($targetcount == 0);
unless ($targetcount == 0);

my $imppercfield = sprintf "|%u", $impperccalc;
my $imppercfield = sprintf "|%u", $impperccalc;
# Prepare the listing page format.
# Prepare the listing page format.
my $pagedata = '{{unimpl_header' . $langfield . $unimpfield . $tcfield . $imppercfield . '}}';
my $pagedata = '{{unimpl_header' . $langfield . $unimpfield . $tcfield . $imppercfield . '}}';
$pagedata .= "{{" . &get_unimpl_template_name($language) . "$unimpfield}}";
$pagedata .= "{{" . &get_unimpl_template_name($language) . "$unimpfield}}";

if (0 != $impldiff->{$language}->{'omit'})
if (0 != $impldiff->{$language}->{'omit'})
{
{
$pagedata .= "{{omit_header" . "$langfield}}";
$pagedata .= "{{omit_header" . "$langfield}}";
$pagedata .= "{{" . &get_omit_template_name($language) ."}}";
$pagedata .= "{{" . &get_omit_template_name($language) ."}}";
}
}

$pagedata .= "{{unimpl_footer$langfield}}";
$pagedata .= "{{unimpl_footer$langfield}}";

return &build_posting(&get_listing_name($language), $pagedata);
return &build_posting(&get_listing_name($language), $pagedata);
}
}


Line 297: Line 177:
sub prep_omit_cat_posting
sub prep_omit_cat_posting
{
{
&slumber;
my $implediff = shift;
my $language = shift;
my $implediff = shift;
my $language = shift;
return &build_posting(&get_omit_cat_name, "{{omit_cat|$language}}");
return &build_posting(&get_omit_cat_name($language), "{{omit_cat|$language}}");
}
}


Line 305: Line 186:
sub get_listing_name
sub get_listing_name
{
{
&slumber;
my $language = shift;
return "Tasks not implemented in $language";
my $language = shift;
return "Tasks not implemented in $language";
}
}


Line 312: Line 194:
sub get_omit_template_name
sub get_omit_template_name
{
{
&slumber;
my $language = shift;
return "unimp_omit_body_$language";
my $language = shift;
return "unimp_omit_body_$language";
}
}


Line 319: Line 202:
sub get_unimpl_template_name
sub get_unimpl_template_name
{
{
&slumber;
my $language = shift;
return "unimp_body_$language";
my $language = shift;
return "unimp_body_$language";
}
}


Line 326: Line 210:
sub get_omit_cat_name
sub get_omit_cat_name
{
{
&slumber;
my $language = shift;
return "Category:$language/Omit";
my $language = shift;
return "Category:$language/Omit";
}
}


Line 333: Line 218:
sub process_language
sub process_language
{
{
&slumber;
my $implediff = shift;
my $language = shift;
my $implediff = shift;
my $language = shift;


my $unimpl = &prep_unimp_posting($implediff, $language);
my $omit = &prep_omit_posting($implediff, $language);
my $unimpl = &prep_unimp_posting($implediff, $language);
my $listing = &prep_listing_posting($implediff, $language);
my $omit = &prep_omit_posting($implediff, $language);
my $listing = &prep_listing_posting($implediff, $language);

my %langpostings = ( %$unimpl, %$omit, %$listing );
my %langpostings = ( %$unimpl, %$omit, %$listing );


unless ( exists $createdomitcategories{&get_omit_cat_name($language)} )
unless ( exists $createdomitcategories{&get_omit_cat_name($language)} )
{
{
my $omit_cat = &prep_omit_cat_posting($implediff, $language);
my $omit_cat = &prep_omit_cat_posting($implediff, $language);
%langpostings = ( %langpostings, %$omit_cat );
%langpostings = ( %langpostings, %$omit_cat );
}
}
return \%langpostings;
}


sub wikitxt_pathname {
return \%langpostings;
&slumber;
my $page_name = $_[0];
"test/" . sanitizenamefs($page_name) . ".wikitxt";
}
}


Line 356: Line 248:
sub processimplediff
sub processimplediff
{
{
&slumber;
my $implediff = shift;
my $implediff = shift;
my %work;
my %work;

# Prepare all our work.
# Prepare all our work.
foreach my $language (keys %impldiff)
foreach my $language (keys %$implediff)
{
{
&out("Preparing data for:$language\n", 4);
my $workitem = &process_language($implediff, $language);
# &out("Preparing data for:$language\n", 4);
%work = ( %work, %$workitem );
my $workitem = &process_language($implediff, $language);
%work = ( %work, %$workitem );
}
}

# Now that we've prepared all our work, commit it.
# Now that we've prepared all our work, commit it.
foreach my $pagename (keys %work)
foreach my $pagename (keys %work)
{
{
&postpage($pagename, $work{$pagename}, "Updating $pagename", 0);
&postpage($pagename, $work{$pagename}, "Updating $pagename", 0);
}
}
}
}


# Return the Mediawiki editor obect.
# Return the Mediawiki editor object.
sub geteditor
sub geteditor
{
{
&slumber;
# If we're not posting, and we're only drawing from cache
# If we're not posting, and we're only drawing from cache
# We don't actually need to pull from the wiki.
# We don't actually need to pull from the wiki.
if( exists $options{'cacheonly'} )
if( exists $options{'cacheonly'} )
{
{
return undef
return undef
unless exists $options{'post'};
unless exists $options{'post'};
}
}

# Handles interaction with the wiki.
# Handles interaction with the wiki.
# Note that I had to modify HTTP::Message to make it work
# HTTP::Message silently failed when presented by MW
# Note that I had to modify HTTP::Message to make it work
# HTTP::Message silently failed when presented by MW
# with an encoding type of "application/json" or some such.
# with an encoding type of "application/json" or some such.
&out("Creating editor\n", 3);
&out("Creating editor\n", 3);
my $editor = MediaWiki::Bot->new('ImpleSearchBot');
my $editor = MediaWiki::Bot->new('ImpleSearchBot');
$editor->{debug} = 1;
$editor->{debug} = 1;

# Tell the editor to edit Rosetta Code. I'm sure Wikipedia didn't like
# Tell the editor to edit Rosetta Code. I'm sure Wikipedia didn't like
# my initial attempts from before I added this line.
# my initial attempts from before I added this line.
&out("Trying to set wiki.\n",3);
&out("Trying to set wiki.\n",3);
$editor->set_wiki('rosettacode.org','w');
$editor->set_wiki('rosettacode.org','mw');

# If we're not posting, we don't need to log in if we're pulling from cache.
# If we're not posting, we don't need to log in if we're pulling from cache.
# Otherwise, attempt to log in.
&out("Trying to log in.\n", 3);
# Otherwise, attempt to log in.
&out("Trying to log in.\n", 3);
my $loginres = $editor->login($options{'username'}, $options{'password'});
my $loginres = $editor->login($options{'username'}, $options{'password'});

die "Unable to login: " . $loginres
die "Unable to login: " . $loginres
unless $loginres == 0;
unless $loginres == 0;
return $editor;
return $editor;
}
}


Line 413: Line 306:
sub out
sub out
{
{
&slumber;
my $string = shift;
my $loglevel = shift;
my $string = shift;
my $loglevel = shift;
chomp $string;


chomp $string;
push @logoutput, ($string);

if($options{'verbosity'} >= $loglevel)
if($options{'verbosity'} >= $loglevel)
{
{
if($loglevel == 1)
if($loglevel == 1)
{
{
warn $string; # use stderr.
warn $string; # use stderr.
}
}
else
else
{
{
print "$string\n";
print "$string\n";
}
}
}
}
}
}


Line 436: Line 332:
sub sanitizenamefs
sub sanitizenamefs
{
{
&slumber;
my $pagename = shift;
$pagename =~ tr/:\//__/;
my $pagename = shift;
return $pagename;
$pagename =~ tr/:\//__/;
return $pagename;
}
}


Line 444: Line 341:
sub diffcat_simple
sub diffcat_simple
{
{
&slumber;
my $first = shift;
my $second = shift;
my $first = shift;
my %firsthash = map { $_, 1 } @$first;
my $second = shift;
my %firsthash = map { $_, 1 } @$first;

my @new = ();
my @new = ();

foreach my $secondelement (@$second)
foreach my $secondelement (@$second)
{
{
push @new, $secondelement
unless exists $firsthash{$secondelement};
push @new, $secondelement
unless exists $firsthash{$secondelement};
}
}

return @new;
return @new;
}
}


Line 462: Line 360:
sub diffcat
sub diffcat
{
{
&slumber;
my $first = shift;
my $second = shift;
my $first = shift;
my @onlyinfirst = &diffcat_simple($second, $first);
my $second = shift;
my @onlyinsecond = &diffcat_simple($first, $second);
my @onlyinfirst = &diffcat_simple($second, $first);
return (\@onlyinfirst, \@onlyinsecond);
my @onlyinsecond = &diffcat_simple($first, $second);
return (\@onlyinfirst, \@onlyinsecond);
}

sub slurp_file {
&slumber;
my $pathname = $_[0];
open(my $stream, "<", $pathname) or do {warn"Can't open '$pathname': $!\n";
return};
my $ret;
while (1) {
my $len = read($stream, my $buf, 1024);
if (!defined($len)) { die "I/O error while reading '$pathname': $!"; }
elsif ($len == 0) { last }
else { $ret .= $buf }
}
$ret;
}
}


Line 472: Line 386:
sub postpage
sub postpage
{
{
&slumber;
my $pagename = shift;
my $pagedata = shift;
my $pagename = shift;
my $remark = shift;
my $pagedata = shift;
my $minoredit = shift;
my $remark = shift;
my $minoredit = shift;

++$pagesedited;

# MediaWiki won't let us create blank, empty pages.
unless( exists $options{'post'} )
# But since we don't want to query (or cache) to see if
{
# the page already exists, we'll just add an HTML
# save it to disk, and out of the way.
# non-breaking-space entity if the page is truly empty.
$pagename = "test/" . &sanitizenamefs($pagename);
$pagename .= ".wikitxt";
$pagedata = "&nbsp;"
if(0 == length $pagedata);

&out("Saving: $pagename\n", 2);
++$pagesedited;
unless( exists $options{'post'} )
{
$pagename = wikitxt_pathname($pagename);
# save it to disk, and out of the way.
&out("Saving: $pagename\n", 2);
open my $outfile, '>', $pagename
or &out("Failed to open $pagename: $!", 1);
return unless defined $outfile;
print $outfile $pagedata;
close $outfile;
}
else {
open my $outfile, '>', $pagename
&out("Posting $pagename\n", 2);
or &log("Failed to open $pagename: $!", 1);
$editor->edit($pagename, $pagedata, "ImplSearchBot:$remark", $minoredit)

or &out("Failed to post page: " . $editor->{'errstr'}, 1);
return unless defined $outfile;
}

print $outfile $pagedata;
close $outfile;
}
else
{
&out("Posting $pagename\n", 2);

$editor->edit($pagename, $pagedata, "ImplSearchBot:$remark", $minoredit)
or &out("Failed to post page: " . $editor->{'errstr'}, 1);
}
}
}

# This allows us to compare site state between now and when we
# This allows us to compare site state between now and when we
# last ran, so that we can learn to avoid doing unnecessary work.
# last ran, so that we can learn to avoid doing unnecessary work.
Line 513: Line 435:
sub cachedata
sub cachedata
{
{
&slumber;
my $dataname = shift;
my $dataname = shift;
my $data = shift;
my $data = shift;
Line 528: Line 451:
return;
return;
}
}

print $outfile to_json($data);
print $outfile to_json($data);
close $outfile;
close $outfile;
&out(scalar $data . " members cached to $filename.\n", 5);
&out(scalar @$data . " members cached to $filename.\n", 5);
}
}

# Return data we cached previously.
# Return data we cached previously.
sub getcacheddata
sub getcacheddata
{
{
&slumber;
my $dataname = shift;
my $dataname = shift;
my $filename = $options{'cachepath'} . &sanitizenamefs($dataname . ".json");
my $filename = $options{'cachepath'} . &sanitizenamefs($dataname . ".json");
Line 546: Line 470:
return [];
return [];
}
}

my $jsondata;
my $jsondata;
$jsondata .= $_ while <$infile>;
$jsondata .= $_ while <$infile>;
close $infile;
close $infile;

my $cacheddata = from_json($jsondata);
my $cacheddata = from_json($jsondata);
&out(scalar @$cacheddata . " cache members retrieved from $filename\n", 5);
&out(scalar @$cacheddata . " cache members retrieved from $filename\n", 5);

++$cachehits;
++$cachehits;
return $cacheddata;
return $cacheddata;
}
}

# Report the changes between two categories.
# Report the changes between two categories.
# More interesting than "x added, y removed"
# More interesting than "x added, y removed"
sub reportcatchanges
sub reportcatchanges
{
{
&slumber;
my $category = shift;
my $category = shift;
my $old = shift;
my $old = shift;
my $new = shift;
my $new = shift;

my ($removed, $added) = &diffcat($old, $new);
my ($removed, $added) = &diffcat($old, $new);

my $out = "";
my $out = "";

$out .= "Removed from $category:\n"
$out .= "Removed from $category:\n"
if( scalar @$removed > 0 );
if( scalar @$removed > 0 );
$out .= "$_\n"
$out .= "$_\n"
foreach (@$removed);
foreach (@$removed);

$out .= "Added to $category:\n"
$out .= "Added to $category:\n"
if( scalar @$added > 0 );
if( scalar @$added > 0 );
$out .= "$_\n"
$out .= "$_\n"
foreach (@$added);
foreach (@$added);

&out($out, 2)
&out($out, 2)
if("" ne $out);
if("" ne $out);
}
}

# Pull the category data, or cached data if we're not pulling from the wiki.
# Pull the category data, or cached data if we're not pulling from the wiki.
sub getcategory
sub getcategory
{
{
&slumber;
my $categoryname = shift;
my $categoryname = shift;
&out("Getting category contents for $categoryname...", 4);
&out("Getting category contents for $categoryname...", 4);

my $old;
my $old;
my $new;
my $new;

if( exists $options{'cacheonly'} )
if( exists $options{'cacheonly'} )
{
{
Line 605: Line 531:
++$categorypulls;
++$categorypulls;
}
}

&reportcatchanges($categoryname, $old, $new);
# &reportcatchanges($categoryname, $old, $new);

&cachedata("$categoryname", $new);
&cachedata("$categoryname", $new);

&out(scalar @$new . " members returned for $categoryname\n", 5);
&out(scalar @$new . " members returned for $categoryname\n", 5);
return $new;
return $new;
}
}

# Find if this category changed, report its contents if it has.
# Find if this category changed, report its contents if it has.
sub getwork
sub getwork
{
{
&slumber;
my $categoryname = shift;
my $categoryname = shift;
&out("Getting work for $categoryname.\n",4);
&out("Getting work for $categoryname.\n",4);
my $cacheddata = &getcacheddata($categoryname);
my $cacheddata = &getcacheddata($categoryname);

# If we're on a cache-only basis, we'll just say we have no old data,
# If we're on a cache-only basis, we'll just say we have no old data,
# and that our cached data is our new data.
# and that our cached data is our new data.
Line 635: Line 563:
$newdata = &getcategory($categoryname);
$newdata = &getcategory($categoryname);
}
}

my ($removed, $added) = &diffcat($olddata, $newdata);
my ($removed, $added) = &diffcat($olddata, $newdata);
my $impl_count_change = scalar @$removed + @$added;
my $impl_count_change = scalar @$removed + @$added;
Line 641: Line 569:
$work = $newdata
$work = $newdata
if(($impl_count_change + $task_count_change) > 0);
if(($impl_count_change + $task_count_change) > 0);
&out(scalar @$work . " items to process for $categoryname.\n", 3);
&out(scalar @$work . " items to process for $categoryname.\n", 3);

return @$work;
return @$work;
}
}

# If changes occurred, the info is more important than if they didn't.
# If changes occurred, the info is more important than if they didn't.
sub getloglevelfromdiff
sub getloglevelfromdiff
{
{
&slumber;
my $base = shift;
my $base = shift;
my $first = shift;
my $first = shift;
Line 657: Line 586:
return $base + 1;
return $base + 1;
}
}

# Find all the work items for a given language.
# Find all the work items for a given language.
sub getlangwork
sub getlangwork
{
{
&slumber;
my $language = shift;
my $language = shift;
&out("Getting lang work for $language\n", 4);
&out("Getting lang work for $language\n", 4);
Line 673: Line 603:
$omit{$workitem} = 1;
$omit{$workitem} = 1;
}
}

# No work to do?
# No work to do?
return undef
return undef
if( ( 0 == scalar keys %omit) && (0 == scalar keys %impl));
if( ( 0 == scalar keys %omit) && (0 == scalar keys %impl));
# Language metadata
# Language metadata
my $unimpcount = scalar @$alltasks - scalar keys %impl;
my $unimpcount = scalar @$alltasks - scalar keys %impl;
my $targetcount = (scalar @$alltasks - scalar keys %omit);
my $targetcount = (scalar @$alltasks - scalar keys %omit);


# work to do.
# work to do.
return {'impl' => \%impl,
return {'impl' => \%impl,
Line 689: Line 619:
'target_count' => $targetcount };
'target_count' => $targetcount };
}
}

# Commit the cache.
# Commit the cache.
sub commitcache
sub commitcache
{
{
&slumber;
# First, find out if we've added any files.
# First, find out if we've added any files.
my $cachepath = $options{'cachepath'};
my $cachepath = $options{'cachepath'};
opendir(CACHEDIR, $cachepath);
opendir(CACHEDIR, $cachepath);

my @current = readdir(CACHEDIR);
my @current = readdir(CACHEDIR);
close(CACHEDIR);
close(CACHEDIR);

# We need to run svn adds if we've created any new files.
# We need to run svn adds if we've created any new files.
# Maybe we'll use SVN::Client some day. Not right now.
# Maybe we'll use SVN::Client some day. Not right now.
my ($added, $removed) = &diffcat(\@current, \@initialcache);
my ($added, $removed) = &diffcat(\@current, \@initialcache);
&out("Detected " . scalar @$added . " new cache files and " . scalar @$removed . " removed\n",&getloglevelfromdiff(2, $added, $removed));
&out("Detected " . scalar @$added . " new cache files and " . scalar @$removed . " removed\n",&getloglevelfromdiff(2, $added, $removed));
if ((scalar @$added + scalar @$removed ) > 0)
if ((scalar @$added + scalar @$removed ) > 0)
{
{
Line 712: Line 643:
&svn('add', $cachepath . $cachefile);;
&svn('add', $cachepath . $cachefile);;
}
}

foreach my $cachefile (@$removed)
foreach my $cachefile (@$removed)
{
{
Line 718: Line 649:
}
}
}
}

&svn('ci', '--message="ImplSearchBot run"', $cachepath);
&svn('ci', '--message="ImplSearchBot run"', $cachepath);
&svn('update', $cachepath);
&svn('update', $cachepath);
}
}


# Wrap svn commands so we can log them.
# Wrap svn commands so we can log them.
sub svn
sub svn
{
{
&slumber;
return
return
if(exists $options{'nosvn'});
if(exists $options{'nosvn'});
my @args = @_;
my @args = @_;

my $string = "system 'svn'";
my $string = "system 'svn'";
$string .= ", '$_'"
$string .= ", '$_'"
foreach (@args);
foreach (@args);

$string .= "\n";
$string .= "\n";

&out($string, 3);
&out($string, 3);

system 'svn', @args
system 'svn', @args
if(exists $options{'post'});
if(exists $options{'post'});
}
}

sub getopt {
&slumber;
#I don't care to pollute my global scope.
my $wiki = 'rosettacode.org';
my $username;
my $password;
my $verbosity = 2; # verbosity level. 0 is silent. 1 is error only. 2 is updates. 3 is process, more is noisy.
my $post; # Is this an actual run?
my $cachepath = "/tmp/";
my $cacheonly; # Don't query the wiki for data. Just pull from cache.
my $nosvn;
my $rebuild_all;
my $opt_matrix = {
"wiki=s" => \$wiki,
"username=s" => \$username,
"password=s" => \$password,
"verbosity=s" => \$verbosity,
"post" => \$post,
"cacheonly" => \$cacheonly,
"nosvn" => \$nosvn,
"cachepath=s" => \$cachepath,
"rebuildall" => \$rebuild_all };
my $result = GetOptions( %$opt_matrix );
$options{'wiki'} = $wiki;
$options{'nosvn'} = $nosvn
if defined $nosvn;
$options{'username'} = $username
if defined $username;
$options{'password'} = $password
if defined $password;
$options{'post'} = $post
if defined $post;
$options{'verbosity'} = $verbosity;
$cachepath .= '/'
if('/' ne substr($cachepath, -1, 1));
$options{'cachepath'} = $cachepath;
$options{'cacheonly'} = $cacheonly
if defined $cacheonly;
$options{'rebuild_all'} = $rebuild_all
if defined $rebuild_all;
my $usage = "Usage: $0 (options)\n The available options are:\n";
$usage .= "\t--$_\n"
foreach (keys %$opt_matrix);
$usage .= "Username and password are required if you need to pull data from the wiki. Wiki defaults to Rosetta Code.\n";
my $wikineeded;
$wikineeded = "yes"
unless ( exists $options{'cacheonly'} );
$wikineeded = "yes"
if ( exists $options{'post'} );
if(defined $wikineeded)
{
unless(exists $options{'username'} and exists $options{'password'})
{
die $usage;
}
}
}
sub main {
&slumber;

unless (@ARGV) { @ARGV = @_; }

getopt();
#Statistic tracking.
# Tracking for svn checkin at end.
# We *should* be the only ones writing to the cache path for now.
# Anyone else should have their own checkout of the data,
# and test runs are done with --cacheonly and --nosvn.
opendir(my $cachedir, $options{'cachepath'})
or die "Unable to open cache directory";
@initialcache = readdir($cachedir);
closedir($cachedir);
&out(scalar @initialcache . " categories initially cached\n", 4);
# Get our editor
$editor = &geteditor();
# Get a complete listing of the tasks.
&out("Getting tasks\n", 3);
$alltasks = &getcategory('Category:Programming Tasks');
# Get a complete listing of the languages.
&out("Getting the languages.\n", 3);
$alllanguages= &getcategory('Category:Programming Languages');
# Quick check. Did we add or lose any tasks? If so, we've got to recalc *all*
# of the pages. :-/
unless(exists $options{'rebuild_all'}) {
my $cacheddata = &getcacheddata('Category:Programming Tasks');
my ($added, $removed) = &diffcat($alltasks, $cacheddata);
$task_count_change = scalar @$added + scalar @$removed;
}
else {
$task_count_change = 1; # It just needs to be nonzero...
}

# We want the language name, not the fully-qualified wiki name.
$_ =~ s/^Category:// foreach (@$alllanguages);

# Get a list of the languages for which we've already provided bodies for the related omit categories.
# Store it as a hash, so the lookup will be faster.
my $omitcatcontents = &getcategory('Category:Maintenance/OmitCategoriesCreated');
my %createdomitcategories = map {$_, 1} @$omitcatcontents;

&out("Identifying work to do\n", 3);

foreach my $lang (@$alllanguages) {
my $val = &getlangwork($lang);
next unless defined $val;
$impldiff{$lang} = $val;
}

&processimplediff(\%impldiff);


my $runtime = time() - $starttime;
my $statsdata = "Pages edited last run: $pagesedited<br/>Time to post all per-language updates: $runtime seconds<br/>Category pulls: $categorypulls<br/>Cache hits: $cachehits<br/>";
&out("Updating stats page. Runtime ($runtime), Pages edited ($pagesedited)\n", 2);

&postpage("User:ImplSearchBot/Stats", $statsdata, "Updating stats data", 0);


&out("Updating bot code page\n", 4);

open my $sourcefile, '<', $0
or die "Finished without updating bot source page";

my $botsource;
$botsource .= $_ while <$sourcefile>;

close $sourcefile;

my $tag = "lang";

&postpage("User:ImplSearchBot/Code", "<$tag perl>$botsource</$tag>", 0);

my $logdata = "<pre>" . join( "\n", @logoutput) . "</pre>";

&postpage("User:ImplSearchBot/Log", $logdata, 0);

&out("Updating cache\n", 4);
&commitcache();

&out("Done\n", 3);

exit 0
}

if (!caller) { exit main }
</lang>
</lang>

Latest revision as of 21:16, 26 August 2009

<lang perl>#!/usr/bin/perl -w use strict; use Getopt::Long; use MediaWiki::Bot; use JSON qw/to_json from_json/;

our $editor; our $starttime = time; our $pagesedited = 0; our $categorypulls = 0; our $cachehits = 0; our $task_count_change; our $alltasks; our %impldiff; our %createdomitcategories; our @initialcache; our $alllanguages; our @logoutput;

my %options;

  1. ---------------------
  2. These are all the functions that break up our work
  3. into logical chunks.
  4. ---------------------
  1. Called in a *lot* of places to check the machine for overload.

sub slumber { # Snooze. while(&snooze) {}; }

  1. A reversed snooze buton

sub snooze { open my $loadavg, '<', '/proc/loadavg' or die "Failed to check load average: $!"; my $loadstr = <$loadavg>; close $loadavg;

# Wait one minute for every process in the wait queue. my @avgs = split / +/, $loadstr, 3;

my $sleeptime = 0;

# Sleep at least one minute for each process in the wait queue. $sleeptime = $avgs[0] * 60;


# If we're currently under notable load if($sleeptime > 30) { print "Calculated sleeptime: $sleeptime\n"; print "Load averages: " . join(' ', @avgs) . " ... Sleeping $sleeptime seconds\n"; sleep $sleeptime; return $sleeptime; }

return 0; }

  1. Builds a simple hash ref associating a page name with body.
  2. Used to help us prepare our postings in one pass, then commit them
  3. in a second pass.

sub build_posting { &slumber;

   my $name = shift;
   my $body = shift;
   
   return {$name => $body};

}

  1. Prepares the template body for the unimplemented data.

sub prep_unimp_posting { &slumber;

   my $impldiff = shift;
   my $language = shift;
   my $unimplisting = "";
   
   foreach my $taskname (@$alltasks)
   {
       # If it's a category task, the task name will be slightly different.
       my $baretaskname = $taskname;
       $baretaskname =~ s/^Category://;
       my $implpage = $taskname;
       $implpage =~ s/^Category:(.*)/$1\/$language/;
 #      my $escapedImplPage = $implpage;
#       $escapedImplPage =~ s/ /_/g;
       
       my $link;

$link = "Template:Unimpl task link";

  1. $link = "$implpage";
       # Add the task to the unimplemented list, if it's unimplemented, and if it's not in the omit list.
       
       unless(exists $impldiff->{$language}->{'impl'}->{$implpage} or exists $impldiff->{$language}->{'omit'}->{$implpage})
       {
           $unimplisting .= "* $link\n";
       }
       
   }
   
   
   return &build_posting("Template:" . &get_unimpl_template_name($language), $unimplisting);

}

  1. Prepares the template body for the listings of omitted data for a page.

sub prep_omit_posting { &slumber;

   my $implediff = shift;
   my $language = shift;
   my $omitlisting = "";
   my $omittemplatename = &get_omit_template_name($language);
   
   foreach my $taskname (@$alltasks)
   {
       # We want the task name, not the fully-qualified wiki name.
       my $baretaskname = $taskname;
       $baretaskname =~ s/^Category://;
       # Add the task to the omission list, if it's omitted.
       my $escapedTaskName = $baretaskname;
       $escapedTaskName =~ s/ /_/g;
       $omitlisting .= "* $baretaskname\n"
           if(exists $impldiff{$language}->{'omit'}->{$taskname})
   }
   
   # Note that there's no data in the template.
   $implediff->{$language}->{'omit'} = 0
       if( "" eq $omitlisting );
   
   return &build_posting("Template:$omittemplatename", $omitlisting);

}

  1. Prepares the page body that folks look at to find out what changed for a language.

sub prep_listing_posting { &slumber;

   my $impldiff = shift;
   my $language = shift;
   
   my $targetcount = $impldiff->{$language}->{'target_count'};
   my $unimpcount = $impldiff->{$language}->{'unimp_count'};
   
   # Prepare template fields
   my $langfield = "|$language";
   my $unimpfield = "|$unimpcount";
   my $tcfield = "|$targetcount";
   
   my $impperccalc = 0;
   $impperccalc = (($targetcount - $unimpcount) / $targetcount) * 100

unless ($targetcount == 0);

   my $imppercfield = sprintf "|%u", $impperccalc;
   
   # Prepare the listing page format.
   my $pagedata = 'Template:Unimpl header' . $langfield . $unimpfield . $tcfield . $imppercfield . '';
   $pagedata .= "Template:" . &get unimpl template name($language) . "$unimpfield";
   
   if (0 != $impldiff->{$language}->{'omit'})
   {
       $pagedata .= "Template:Omit header" . "$langfield";
       $pagedata .= "Template:" . &get omit template name($language) ."";
   }
   
   $pagedata .= "Template:Unimpl footer$langfield";
   
   return &build_posting(&get_listing_name($language), $pagedata);

}

  1. Prepare the body of the omit category.

sub prep_omit_cat_posting { &slumber;

   my $implediff = shift;
   my $language = shift;
   return &build_posting(&get_omit_cat_name($language), "Tasks listed here have been marked as "un-implementable" in $language. Solutions may be impossible to do, too complex to be of any valuable instruction in $language, or prohibited by the task definition.

But hey; if you think you can prove us wrong, go for it. :-) "); }

  1. Prepare the page title for the page folks look at to see what change what.

sub get_listing_name { &slumber;

   my $language = shift;
   return "Tasks not implemented in $language";

}

  1. Prepare the template name for the omit listing body

sub get_omit_template_name { &slumber;

   my $language = shift;
   return "unimp_omit_body_$language";

}

  1. Prepare the template name for the unimplimended listing body

sub get_unimpl_template_name { &slumber;

   my $language = shift;
   return "unimp_body_$language";

}

  1. Return the name of the category to find the omitted pages for a particular language.

sub get_omit_cat_name { &slumber;

   my $language = shift;
   return "Category:$language/Omit";

}

  1. Prepare all the pages associated with a specific language.

sub process_language { &slumber;

   my $implediff = shift;
   my $language = shift;
   
   
   my $unimpl = &prep_unimp_posting($implediff, $language);
   my $omit = &prep_omit_posting($implediff, $language);
   my $listing = &prep_listing_posting($implediff, $language);
   
   my %langpostings = ( %$unimpl, %$omit, %$listing );
   
   
   unless ( exists $createdomitcategories{&get_omit_cat_name($language)} )
   {
       my $omit_cat = &prep_omit_cat_posting($implediff, $language);
       %langpostings = ( %langpostings, %$omit_cat );
   }
   
   return \%langpostings;

}

sub wikitxt_pathname { &slumber;

   my $page_name = $_[0];
   "test/" . sanitizenamefs($page_name) . ".wikitxt";

}

  1. Prepare and post all of the core pages for this run.

sub processimplediff { &slumber;

   my $implediff = shift;
   my %work;
   
   # Prepare all our work.
   foreach my $language (keys %$implediff)
   {
       # &out("Preparing data for:$language\n", 4);
       my $workitem = &process_language($implediff, $language);
       %work = ( %work, %$workitem );
   }
   # Now that we've prepared all our work, commit it.
   foreach my $pagename (keys %work)
   {
       &postpage($pagename, $work{$pagename}, "Updating $pagename", 0);
   }

}

  1. Return the Mediawiki editor object.

sub geteditor { &slumber;

   # If we're not posting, and we're only drawing from cache
   # We don't actually need to pull from the wiki.
   if( exists $options{'cacheonly'} )
   {
       return undef
           unless exists $options{'post'};
   }
   
   # Handles interaction with the wiki.
   # Note that I had to modify HTTP::Message to make it work
   # HTTP::Message silently failed when presented by MW
   # with an encoding type of "application/json" or some such.
   &out("Creating editor\n", 3);
   my $editor = MediaWiki::Bot->new('ImpleSearchBot');
   $editor->{debug} = 1;
   
   # Tell the editor to edit Rosetta Code.  I'm sure Wikipedia didn't like
   # my initial attempts from before I added this line.
   &out("Trying to set wiki.\n",3);
   $editor->set_wiki('rosettacode.org','mw');
   
   # If we're not posting, we don't need to log in if we're pulling from cache.
   # Otherwise, attempt to log in.
   &out("Trying to log in.\n", 3);
   my $loginres = $editor->login($options{'username'}, $options{'password'});
   
   die "Unable to login: " . $loginres
       unless $loginres == 0;
   
   return $editor;

}

  1. Simple logging infrastructure. Current sends to STDWARN or STDOUT, which cron
  2. emails to the user, which gets forwarded to Short Circuit. (Anyone want a copy?)

sub out { &slumber;

   my $string = shift;
   my $loglevel = shift;
   
   chomp $string;
   push @logoutput, ($string);
   
   if($options{'verbosity'} >= $loglevel)
   {
       if($loglevel == 1)
       {
           warn $string; # use stderr.
       }
       else
       {
           print "$string\n";
       }
   }

}

  1. Many popular filesystems can't andle : and \ in filenames.
  2. Since I plan to open the SVN repo we save to the rest of the world at some point,
  3. I'm trying to make sure the files are representable.

sub sanitizenamefs { &slumber;

   my $pagename = shift;
   $pagename =~ tr/:\//__/;
   return $pagename;

}

  1. Find all the entries that are in the second list ref, but not the first.

sub diffcat_simple { &slumber;

   my $first = shift;
   my $second = shift;
   my %firsthash = map { $_, 1 } @$first;
   
   my @new = ();
   
   foreach my $secondelement (@$second)
   {
       push @new, $secondelement
           unless exists $firsthash{$secondelement};
   }
   
   return @new;

}

  1. Find all the entries that are in one listref, but not the other.

sub diffcat { &slumber;

   my $first = shift;
   my $second = shift;
   my @onlyinfirst = &diffcat_simple($second, $first);
   my @onlyinsecond = &diffcat_simple($first, $second);
   return (\@onlyinfirst, \@onlyinsecond);

}

sub slurp_file { &slumber;

   my $pathname = $_[0];
   open(my $stream, "<", $pathname) or do {warn"Can't open '$pathname': $!\n";
                                           return};
   my $ret;
   while (1) {
       my $len = read($stream, my $buf, 1024);
       if (!defined($len)) { die "I/O error while reading '$pathname': $!"; }
       elsif ($len == 0) { last }
       else { $ret .= $buf }
   }
   $ret;

}

  1. Post a page (or save it to disk, if we're testing.)

sub postpage { &slumber;

   my $pagename = shift;
   my $pagedata = shift;
   my $remark = shift;
   my $minoredit = shift;
   
   
   # MediaWiki won't let us create blank, empty pages.
   # But since we don't want to query (or cache) to see if
   # the page already exists, we'll just add an HTML
   # non-breaking-space entity if the page is truly empty.
   $pagedata = " "
       if(0 == length $pagedata);
   
   ++$pagesedited;
   
   unless( exists $options{'post'} )
   {
       $pagename = wikitxt_pathname($pagename);
       # save it to disk, and out of the way.
       
       &out("Saving: $pagename\n", 2);
       
       open my $outfile, '>', $pagename
           or &out("Failed to open $pagename: $!", 1);
       
       return unless defined $outfile;
       
       print $outfile $pagedata;
       close $outfile;
   }
   else {
       &out("Posting $pagename\n", 2);
   
   $editor->edit($pagename, $pagedata, "ImplSearchBot:$remark", $minoredit)
       or &out("Failed to post page: " . $editor->{'errstr'}, 1);
   }

}

  1. This allows us to compare site state between now and when we
  2. last ran, so that we can learn to avoid doing unnecessary work.
  3. (Saves on server resources and bloated edit statistics...)
  4. Also, the data gets saved to an SVN repo, so that multiple bots can
  5. use the history. We're using JSON, as JSON has broader
  6. cross-language support than Data::Dumper, making it easier for others
  7. to use the data.

sub cachedata { &slumber; my $dataname = shift; my $data = shift; my $filename = $options{'cachepath'} . &sanitizenamefs($dataname . ".json"); unless( exists $options{'post'} ) { # This is just a test run. Best not to overwrite our cache. &out("Skipping caching of $dataname to $filename...", 4); return; } my $outfile; unless(open $outfile, '>', $filename) { &out("Failed to open $filename for caching: $!\n", 1); return; }

print $outfile to_json($data); close $outfile; &out(scalar @$data . " members cached to $filename.\n", 5); }

  1. Return data we cached previously.

sub getcacheddata { &slumber; my $dataname = shift; my $filename = $options{'cachepath'} . &sanitizenamefs($dataname . ".json"); &out("Getting cached data for $dataname from $filename...", 4); my $infile; unless (open $infile, '<', $filename) { &out("Failed to load cached data $filename: $!\n", 5); return []; }

my $jsondata; $jsondata .= $_ while <$infile>; close $infile;

my $cacheddata = from_json($jsondata); &out(scalar @$cacheddata . " cache members retrieved from $filename\n", 5);

++$cachehits; return $cacheddata; }

  1. Report the changes between two categories.
  2. More interesting than "x added, y removed"

sub reportcatchanges { &slumber; my $category = shift; my $old = shift; my $new = shift;

my ($removed, $added) = &diffcat($old, $new);

my $out = "";

$out .= "Removed from $category:\n" if( scalar @$removed > 0 ); $out .= "$_\n" foreach (@$removed);

$out .= "Added to $category:\n" if( scalar @$added > 0 ); $out .= "$_\n" foreach (@$added);

&out($out, 2) if("" ne $out); }

  1. Pull the category data, or cached data if we're not pulling from the wiki.

sub getcategory { &slumber; my $categoryname = shift; &out("Getting category contents for $categoryname...", 4);

my $old; my $new;

if( exists $options{'cacheonly'} ) { # Return the cache data if we're not supposed to query the database. $old = []; $new = &getcacheddata($categoryname); } else { $old = &getcacheddata($categoryname); $new = [$editor->get_pages_in_category($categoryname)]; ++$categorypulls; }

# &reportcatchanges($categoryname, $old, $new);

&cachedata("$categoryname", $new);

&out(scalar @$new . " members returned for $categoryname\n", 5);

return $new; }

  1. Find if this category changed, report its contents if it has.

sub getwork { &slumber; my $categoryname = shift; &out("Getting work for $categoryname.\n",4); my $cacheddata = &getcacheddata($categoryname);

# If we're on a cache-only basis, we'll just say we have no old data, # and that our cached data is our new data. my $olddata; my $newdata; if (exists $options{'cacheonly'}) { $olddata = []; $newdata = $cacheddata; } else { $olddata = $cacheddata; $newdata = &getcategory($categoryname); }

my ($removed, $added) = &diffcat($olddata, $newdata); my $impl_count_change = scalar @$removed + @$added; my $work = []; $work = $newdata if(($impl_count_change + $task_count_change) > 0);

&out(scalar @$work . " items to process for $categoryname.\n", 3);

return @$work; }

  1. If changes occurred, the info is more important than if they didn't.

sub getloglevelfromdiff { &slumber; my $base = shift; my $first = shift; my $second = shift; return $base if((scalar @$first + scalar @$second) > 0); return $base + 1; }

  1. Find all the work items for a given language.

sub getlangwork { &slumber; my $language = shift; &out("Getting lang work for $language\n", 4); my %impl; foreach my $workitem (&getwork("Category:$language")) { $impl{$workitem} = 1; } my %omit; foreach my $workitem (&getwork("Category:$language/Omit")) { $omit{$workitem} = 1; }

# No work to do? return undef if( ( 0 == scalar keys %omit) && (0 == scalar keys %impl));

# Language metadata my $unimpcount = scalar @$alltasks - scalar keys %impl; my $targetcount = (scalar @$alltasks - scalar keys %omit);


# work to do. return {'impl' => \%impl, 'omit' => \%omit,

      		'unimp_count' => $unimpcount,

'target_count' => $targetcount }; }

  1. Commit the cache.

sub commitcache { &slumber; # First, find out if we've added any files. my $cachepath = $options{'cachepath'}; opendir(CACHEDIR, $cachepath);

my @current = readdir(CACHEDIR); close(CACHEDIR);

# We need to run svn adds if we've created any new files. # Maybe we'll use SVN::Client some day. Not right now.

my ($added, $removed) = &diffcat(\@current, \@initialcache); &out("Detected " . scalar @$added . " new cache files and " . scalar @$removed . " removed\n",&getloglevelfromdiff(2, $added, $removed));

if ((scalar @$added + scalar @$removed ) > 0) { foreach my $cachefile (@$added) { &svn('add', $cachepath . $cachefile);; }

foreach my $cachefile (@$removed) { &svn('remove', $cachepath . $cachefile);; } }

&svn('ci', '--message="ImplSearchBot run"', $cachepath); &svn('update', $cachepath); }


  1. Wrap svn commands so we can log them.

sub svn { &slumber; return if(exists $options{'nosvn'}); my @args = @_;

my $string = "system 'svn'"; $string .= ", '$_'" foreach (@args);

$string .= "\n";

&out($string, 3);

system 'svn', @args if(exists $options{'post'}); }

sub getopt { &slumber;

   #I don't care to pollute my global scope.
   my $wiki = 'rosettacode.org';
   my $username;
   my $password;
   my $verbosity = 2; # verbosity level.  0 is silent. 1 is error only. 2 is updates. 3 is process, more is noisy.
   my $post; # Is this an actual run?
   my $cachepath = "/tmp/";
   my $cacheonly; # Don't query the wiki for data. Just pull from cache.
   my $nosvn;
   my $rebuild_all;
   
   my $opt_matrix = {
       "wiki=s"	=> \$wiki,
       "username=s"	=> \$username,
       "password=s"	=> \$password,
       "verbosity=s"	=> \$verbosity,
       "post"		=> \$post,
       "cacheonly"	=> \$cacheonly,
       "nosvn"		=> \$nosvn,
       "cachepath=s"	=> \$cachepath,
       "rebuildall"	=> \$rebuild_all	};
   
   my $result = GetOptions( %$opt_matrix );
   $options{'wiki'} = $wiki;
   
   $options{'nosvn'} = $nosvn
       if defined $nosvn;
   
   $options{'username'} = $username
       if defined $username;
   
   $options{'password'} = $password
       if defined $password;
   
   $options{'post'} = $post
       if defined $post;
   
   $options{'verbosity'} = $verbosity;
   
   $cachepath .= '/'
       if('/' ne substr($cachepath, -1, 1));
   
   $options{'cachepath'} = $cachepath;
   
   $options{'cacheonly'} = $cacheonly
       if defined $cacheonly;
   
   $options{'rebuild_all'} = $rebuild_all
       if defined $rebuild_all;
   
   
   my $usage = "Usage: $0 (options)\n The available options are:\n";
   $usage .= "\t--$_\n"
       foreach (keys %$opt_matrix);
   
   $usage .= "Username and password are required if you need to pull data from the wiki.  Wiki defaults to Rosetta Code.\n";
   
   
   my $wikineeded;
   
   $wikineeded = "yes"
       unless ( exists $options{'cacheonly'} );
   
   $wikineeded = "yes"
       if ( exists $options{'post'} );
   
   if(defined $wikineeded)
   {
       unless(exists $options{'username'} and exists $options{'password'})
       {
           die $usage;
       }
   }

}

sub main { &slumber;

   unless (@ARGV) { @ARGV = @_; }
   getopt();
   
  1. Statistic tracking.
  1. Tracking for svn checkin at end.
  2. We *should* be the only ones writing to the cache path for now.
  3. Anyone else should have their own checkout of the data,
  4. and test runs are done with --cacheonly and --nosvn.
   opendir(my $cachedir, $options{'cachepath'})
       or die "Unable to open cache directory";
   @initialcache = readdir($cachedir);
   closedir($cachedir);
   
   &out(scalar @initialcache . " categories initially cached\n", 4);
   
  1. Get our editor
   $editor = &geteditor();
   
  1. Get a complete listing of the tasks.
   &out("Getting tasks\n", 3);
   $alltasks = &getcategory('Category:Programming Tasks');
   
  1. Get a complete listing of the languages.
   &out("Getting the languages.\n", 3);
   $alllanguages= &getcategory('Category:Programming Languages');
   
  1. Quick check. Did we add or lose any tasks? If so, we've got to recalc *all*
  2. of the pages. :-/
   unless(exists $options{'rebuild_all'}) {
       my $cacheddata = &getcacheddata('Category:Programming Tasks');
       my ($added, $removed) = &diffcat($alltasks, $cacheddata);
       $task_count_change = scalar @$added + scalar @$removed;
   }
   else {
       $task_count_change = 1; # It just needs to be nonzero...
   }
  1. We want the language name, not the fully-qualified wiki name.
   $_ =~ s/^Category:// foreach (@$alllanguages);
  1. Get a list of the languages for which we've already provided bodies for the related omit categories.
  2. Store it as a hash, so the lookup will be faster.
   my $omitcatcontents = &getcategory('Category:Maintenance/OmitCategoriesCreated');
   my %createdomitcategories = map {$_, 1} @$omitcatcontents;
   &out("Identifying work to do\n", 3);
   foreach my $lang (@$alllanguages) {
       my $val = &getlangwork($lang);
       next unless defined $val;
       $impldiff{$lang} = $val;
   }
   &processimplediff(\%impldiff);


   my $runtime = time() - $starttime;
   my $statsdata = "Pages edited last run: $pagesedited
Time to post all per-language updates: $runtime seconds
Category pulls: $categorypulls
Cache hits: $cachehits
"; &out("Updating stats page. Runtime ($runtime), Pages edited ($pagesedited)\n", 2);
   &postpage("User:ImplSearchBot/Stats", $statsdata, "Updating stats data", 0);


   &out("Updating bot code page\n", 4);
   open my $sourcefile, '<', $0
       or die "Finished without updating bot source page";
   my $botsource;
   $botsource .= $_ while <$sourcefile>;
   close $sourcefile;
   my $tag = "lang";
   &postpage("User:ImplSearchBot/Code", "<$tag perl>$botsource</$tag>", 0);

my $logdata = "

" . join( "\n", @logoutput) . "

";

   &postpage("User:ImplSearchBot/Log", $logdata, 0);
   &out("Updating cache\n", 4);
   &commitcache();
   &out("Done\n", 3);
   exit 0

}

if (!caller) { exit main } </lang>