User:ImplSearchBot/Code

From Rosetta Code
Revision as of 10:13, 22 March 2009 by rosettacode>ImplSearchBot (ImplSearchBot:0)

<lang perl>#!/usr/bin/perl -w use strict; use Getopt::Long; use MediaWiki::Bot; use JSON qw/to_json from_json/;

my $usage = "Usage: $0 --username=(username) --password=(password) [--posttosite=yes]";

my %options;

  1. I don't care to pollute my global scope.

{ my $wiki = 'rosettacode.org'; my $username; my $password; my $verbosity = 2; # verbosity level. 0 is silent. 1 is error only. 2 is updates. 3 is process, more is noisy. my $post; # Is this an actual run? my $cachepath = "cache/"; my $cacheonly; # Don't query the wiki for data. Just pull from cache. my $nosvn; my $result = GetOptions( "wiki=s" => \$wiki, "username=s" => \$username, "password=s" => \$password, "verbosity=s" => \$verbosity, "post" => \$post, "cacheonly" => \$cacheonly, "nosvn" => \$nosvn, "cachepath=s" => \$cachepath); $options{'wiki'} = $wiki;

$options{'nosvn'} = $nosvn if defined $nosvn;

$options{'username'} = $username if defined $username;

$options{'password'} = $password if defined $password;

$options{'post'} = $post if defined $post;

$options{'verbosity'} = $verbosity;

$cachepath .= '/' if('/' ne substr($cachepath, -1, 1));

$options{'cachepath'} = $cachepath;

$options{'cacheonly'} = $cacheonly if defined $cacheonly;


# This could be reversed as a qualified statement, but I don't know # if that it'd be any less ugly. unless( exists $options{'cacheonly'}) { if( exists $options{'post'} ) { unless(exists $options{'username'} and exists $options{'password'}) { use Data::Dumper; print Dumper(%options); die $usage; } } } }


  1. Statistic tracking.

my $starttime = time; my $pagesedited = 0; my $categorypulls = 0; my $cachehits = 0;

  1. Tracking for svn checkin at end.
  2. We *should* be the only ones writing to the cache path for now.
  3. Anyone else should have their own checkout of the data,
  4. and test runs are done with --cacheonly and --nosvn.

opendir(CACHEDIR, $options{'cachepath'})

 or die "Unable to open cache directory";

my @initialcache = readdir(CACHEDIR); closedir(CACHEDIR);

&out(scalar @initialcache . " categories initially cached\n", 4);

  1. Get our editor

my $editor = &geteditor();

  1. Get a complete listing of the tasks.

&out("Getting tasks\n", 3); my $alltasks = &getcategory('Category:Programming Tasks');

  1. Get a complete listing of the languages.

&out("Getting the languages.\n", 3); my $alllanguages = &getcategory('Category:Programming Languages');

  1. We want the language name, not the fully-qualified wiki name.

$_ =~ s/^Category:// foreach (@$alllanguages);

  1. Get a list of the languages for which we've already provided bodies for the related omit categories.
  2. Store it as a hash, so the lookup will be faster.

my $omitcatcontents = &getcategory('Category:Maintenance/OmitCategoriesCreated'); my %createdomitcategories = map {$_, 1} @$omitcatcontents;

&out("Identifying work to do\n", 3);

my %impldiff;

foreach my $lang (@$alllanguages) { my $val = &getlangwork($lang); next unless defined $val; $impldiff{$lang} = $val; }

&processimplediff(\%impldiff);


  1. To add here: Post stats on activities:
  2. 2. Time last run started

my $runtime = time() - $starttime; my $statsdata = "Pages edited last run: $pagesedited
Time to post all per-language updates: $runtime seconds
Category pulls: $categorypulls
Cache hits: $cachehits
"; &out("Updating stats page. Runtime ($runtime), Pages edited ($pagesedited)\n", 2);

&postpage("User:ImplSearchBot/Stats", $statsdata, "Updating stats data", 0);


&out("Updating bot code page\n", 4);

open my $sourcefile, '<', $0 or die "Finished without updating bot source page";

my $botsource; $botsource .= $_ while <$sourcefile>;

close $sourcefile;

my $tag = "lang";

&postpage("User:ImplSearchBot/Code", "<$tag perl>$botsource</$tag>", 0);

&out("Updating cache\n", 4); &commitcache();

&out("Done\n", 3);

exit(0);

sub build_posting { my $name = shift; my $body = shift;

return {$name => $body}; }

sub prep_unimp_posting { my $impldiff = shift; my $language = shift; my $unimplisting = "";

foreach my $taskname (@$alltasks) { # We want the task name, not the fully-qualified wiki name. my $baretaskname = $taskname; $baretaskname =~ s/^Category://;

# Add the task to the unimplemented list, if it's unimplemented, and if it's not in the omit list. $unimplisting .= "* $baretaskname\n" unless(exists $impldiff->{$language}->{$taskname} or exists $impldiff->{$language}->{'omit'}->{$taskname});

}


return &build_posting("Template:" . &get_unimpl_template_name($language), $unimplisting); }

sub prep_omit_posting { my $implediff = shift; my $language = shift; my $omitlisting = ""; my $omittemplatename = &get_omit_template_name($language);

foreach my $taskname (@$alltasks) { # We want the task name, not the fully-qualified wiki name. my $baretaskname = $taskname; $baretaskname =~ s/^Category://; # Add the task to the omission list, if it's omitted. $omitlisting .= "* $baretaskname\n" if(exists $impldiff{$language}->{'omit'}->{$taskname}) }

# Note that there's no data in the template. $implediff->{$language}->{'omit'} = 0 if( "" eq $omitlisting );

return &build_posting("Template:$omittemplatename", $omitlisting); }

sub prep_listing_posting { my $impldiff = shift; my $language = shift;

my $targetcount = $impldiff->{$language}->{'target_count'}; my $unimpcount = $impldiff->{$language}->{'unimp_count'};

# Prepare template fields my $langfield = "|$language"; my $unimpfield = "|$unimpcount"; my $tcfield = "|$targetcount";

my $impperccalc = 0; $impperccalc = (($targetcount - $unimpcount) / $targetcount) * 100 unless ($targetcount == 0);

my $imppercfield = sprintf "|%u", $impperccalc;

# Prepare the listing page format. my $pagedata = 'Template:Unimpl header' . $langfield . $unimpfield . $tcfield . $imppercfield . ''; $pagedata .= "Template:" . &get unimpl template name($language) . "$unimpfield";

if (0 != $impldiff->{$language}->{'omit'}) { $pagedata .= "Template:Omit header" . "$langfield"; $pagedata .= "Template:" . &get omit template name($language) .""; }

$pagedata .= "Template:Unimpl footer$langfield";

return &build_posting(&get_listing_name($language), $pagedata); }

sub prep_omit_cat_posting { my $implediff = shift; my $language = shift; return &build_posting(&get_omit_cat_name, "Tasks listed here have been marked as "un-implementable" in $language. Solutions may be impossible to do, too complex to be of any valuable instruction in $language, or prohibited by the task definition.

But hey; if you think you can prove us wrong, go for it. :-) "); }

sub get_listing_name { my $language = shift; return "Tasks not implemented in $language"; }

sub get_omit_template_name { my $language = shift; return "unimp_omit_body_$language"; }

sub get_unimpl_template_name { my $language = shift; return "unimp_body_$language"; }

sub get_omit_cat_name { my $language = shift; return "Category:$language/Omit"; }

sub process_language { my $implediff = shift; my $language = shift;


my $unimpl = &prep_unimp_posting($implediff, $language); my $omit = &prep_omit_posting($implediff, $language); my $listing = &prep_listing_posting($implediff, $language);

my %langpostings = ( %$unimpl, %$omit, %$listing );


unless ( exists $createdomitcategories{&get_omit_cat_name($language)} ) { my $omit_cat = &prep_omit_cat_posting($implediff, $language); %langpostings = ( %langpostings, %$omit_cat ); }

return \%langpostings; }

sub processimplediff { my $implediff = shift; my %work; foreach my $language (keys %impldiff) { &out("Preparing data for:$language\n", 4); my $workitem = &process_language($implediff, $language); %work = ( %work, %$workitem ); }

foreach my $pagename (keys %work) { &postpage($pagename, $work{$pagename}, "Updating $pagename", 0); } }

sub geteditor { # If we're not posting, and we're only drawing from cache # We don't actually need to pull from the wiki. if( exists $options{'cacheonly'} ) { return undef unless exists $options{'post'}; }

# Handles interaction with the wiki. # Note that I had to modify HTTP::Message to make it work # HTTP::Message silently failed when presented by MW # with an encoding type of "application/json" or some such. &out("Creating editor\n", 3); my $editor = MediaWiki::Bot->new('ImpleSearchBot'); $editor->{debug} = 1;

# Tell the editor to edit Rosetta Code. I'm sure Wikipedia didn't like # my initial attempts from before I added this line. &out("Trying to set wiki.\n",3); $editor->set_wiki('rosettacode.org','w');

# If we're not posting, we don't need to log in if we're pulling from cache. # Otherwise, attempt to log in. &out("Trying to log in.\n", 3); my $loginres = $editor->login($options{'username'}, $options{'password'});

die "Unable to login: " . $loginres unless $loginres == 0;

return $editor; }

sub out { my $string = shift; my $loglevel = shift;

chomp $string;

if($options{'verbosity'} >= $loglevel) { if($loglevel == 1) { warn $string; # use stderr. } else { print "$string\n"; } } }

sub sanitizenamefs { my $pagename = shift; $pagename =~ tr/:\//__/; return $pagename; }

sub diffcat_simple { my $first = shift; my $second = shift; my %firsthash = map { $_, 1 } @$first;

my @new = ();

foreach my $secondelement (@$second) { push @new, $secondelement unless exists $firsthash{$secondelement}; }

return @new; }

sub diffcat { my $first = shift; my $second = shift; my @onlyinfirst = &diffcat_simple($second, $first); my @onlyinsecond = &diffcat_simple($first, $second); return (\@onlyinfirst, \@onlyinsecond); }

sub postpage { my $pagename = shift; my $pagedata = shift; my $remark = shift; my $minoredit = shift;

++$pagesedited;

unless( exists $options{'post'} ) { # save it to disk, and out of the way. $pagename = "test/" . &sanitizenamefs($pagename); $pagename .= ".wikitxt";

&out("Saving: $pagename\n", 2);

open my $outfile, '>', $pagename or &log("Failed to open $pagename: $!", 1);

return unless defined $outfile;

print $outfile $pagedata; close $outfile; } else { &out("Posting $pagename\n", 2);

$editor->edit($pagename, $pagedata, "ImplSearchBot:$remark", $minoredit) or &out("Failed to post page: " . $editor->{'errstr'}, 1); } }

  1. This allows us to compare site state between now and when we
  2. last ran, so that we can learn to avoid doing unnecessary work.
  3. (Saves on server resources and bloated edit statistics...)
  4. Also, I plan on publishing the cache files in a version control
  5. system like SVN or Git, to allow multiple bots
  6. to share the data and to provide history. We're using JSON
  7. Instead of Perl's native Data::Dumper as JSON has broader
  8. cross-language support, making it easier for others to use the data.
  9. SVN has the advantage that I already know how to use it.
  10. Git has the advantage in that I can use GitHub and not tax
  11. my Slice with Git traffic, and I don't have to punch a hole
  12. in the firewall to access more services.

sub cachedata { my $dataname = shift; my $data = shift; my $filename = $options{'cachepath'} . &sanitizenamefs($dataname . ".json"); unless( exists $options{'post'} ) { # This is just a test run. Best not to overwrite our cache. &out("Skipping caching of $dataname to $filename...", 4); return; } my $outfile; unless(open $outfile, '>', $filename) { &out("Failed to open $filename for caching: $!\n", 1); return; }

print $outfile to_json($data); close $outfile; &out(scalar $data . " members cached to $filename.\n", 5); }

sub getcacheddata { my $dataname = shift; my $filename = $options{'cachepath'} . &sanitizenamefs($dataname . ".json"); &out("Getting cached data for $dataname from $filename...", 4); my $infile; unless (open $infile, '<', $filename) { &out("Failed to load cached data $filename: $!\n", 1); return []; }

my $jsondata; $jsondata .= $_ while <$infile>; close $infile;

my $cacheddata = from_json($jsondata); &out(scalar @$cacheddata . " cache members retrieved from $filename\n", 5);

++$cachehits; return $cacheddata; }

sub getcategory { my $categoryname = shift;

# Return the cache data if we're not supposed to query the database. return &getcacheddata($categoryname) if( exists $options{'cacheonly'} );

&out("Getting category contents for $categoryname...", 4); my @categorycontents = $editor->get_pages_in_category($categoryname); &out(scalar @categorycontents . " members retrieved for $categoryname\n", 5);

my ($removed, $added) = &diffcat(&getcacheddata($categoryname), \@categorycontents);

&out(scalar @$removed . " removed, " . @$added . " added to $categoryname\n", &getloglevelfromdiff(2, $removed, $added));

++$categorypulls;

&cachedata("$categoryname", \@categorycontents);

return \@categorycontents; }

sub getwork { my $categoryname = shift; &out("Getting work for $categoryname.\n",4); my $cacheddata = &getcacheddata($categoryname);

# If we're on a cache-only basis, we'll just say we have no old data, # and that our cached data is our new data. my $olddata; my $newdata; if (exists $options{'cacheonly'}) { $olddata = []; $newdata = $cacheddata; } else { $olddata = $cacheddata; $newdata = &getcategory($categoryname); }

my ($removed, $added) = &diffcat($olddata, $newdata); my $work = []; $work = $newdata if((scalar @$removed + scalar @$added) > 0);

&out(scalar @$work . " items to process for $categoryname.\n", 3);

return @$work; }

sub getloglevelfromdiff { my $base = shift; my $first = shift; my $second = shift; return $base if((scalar @$first + scalar @$second) > 0); return $base + 1; }

sub getlangwork { my $language = shift; &out("Getting lang work for $language\n", 4); my %impl; foreach my $workitem (&getwork("Category:$language")) { $impl{$workitem} = 1; } my %omit; foreach my $workitem (&getwork("Category:$language/Omit")) { $omit{$workitem} = 1; }

# No work to do? return undef if( ( 0 == scalar keys %omit) && (0 == scalar keys %impl));

# Language metadata my $unimpcount = scalar @$alltasks - scalar keys %impl; my $targetcount = (scalar @$alltasks - scalar keys %omit);


# work to do. return {'impl' => \%impl, 'omit' => \%omit,

      		'unimp_count' => $unimpcount,

'target_count' => $targetcount }; }

sub commitcache { # First, find out if we've added any files. my $cachepath = $options{'cachepath'}; opendir(CACHEDIR, $cachepath);

my @current = readdir(CACHEDIR); close(CACHEDIR);

# We need to run svn adds if we've created any new files. # Maybe we'll use SVN::Client some day. Not right now.

my ($added, $removed) = &diffcat(\@current, \@initialcache); &out("Detected " . scalar @$added . " new cache files and " . scalar @$removed . " removed\n",&getloglevelfromdiff(2, $added, $removed));

if ((scalar @$added + scalar @$removed ) > 0) { foreach my $cachefile (@$added) { &svn('add', $cachepath . $cachefile);; }

foreach my $cachefile (@$removed) { &svn('remove', $cachepath . $cachefile);; } }

&svn('ci', '--message="ImplSearchBot run"', $cachepath); &svn('update', $cachepath); }

sub svn { return if(exists $options{'nosvn'}); my @args = @_;

my $string = "system 'svn'"; $string .= ", '$_'" foreach (@args);

$string .= "\n";

&out($string, 3);

system 'svn', @args if(exists $options{'post'}); } </lang>