User:ImplSearchBot/Code: Difference between revisions
m (User:Short Circuit/ImplSearchBot moved to User:ImplSearchBot/ImplSearchBot: Moved to the user created for it.) |
m (ImplSearchBot:0) |
||
(37 intermediate revisions by 2 users not shown) | |||
Line 1:
<lang perl>#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use MediaWiki::Bot;
use JSON qw/to_json from_json/;
our $editor;
our $starttime = time;
our $pagesedited = 0;
our $categorypulls = 0;
our $cachehits = 0;
our $task_count_change;
our $alltasks;
our %impldiff;
our %createdomitcategories;
our @initialcache;
our $alllanguages;
our @logoutput;
my %options;
#---------------------
# These are all the functions that break up our work
# into logical chunks.
#---------------------
# Called in a *lot* of places to check the machine for overload.
sub slumber
{
# Snooze.
while(&snooze) {};
}
# A reversed snooze buton
sub snooze
{
open my $loadavg, '<', '/proc/loadavg'
or die "Failed to check load average: $!";
my $loadstr = <$loadavg>;
close $loadavg;
# Wait one minute for every process in the wait queue.
my @avgs = split / +/, $loadstr, 3;
my $sleeptime = 0;
# Sleep at least one minute for each process in the wait queue.
$sleeptime = $avgs[0] * 60;
# If we're currently under notable load
if($sleeptime > 30)
{
print "Calculated sleeptime: $sleeptime\n";
print "Load averages: " . join(' ', @avgs) . " ... Sleeping $sleeptime seconds\n";
sleep $sleeptime;
return $sleeptime;
}
return 0;
}
# Builds a simple hash ref associating a page name with body.
# Used to help us prepare our postings in one pass, then commit them
# in a second pass.
sub build_posting
{
&slumber;
my $name = shift;
my $body = shift;
return {$name => $body};
}
# Prepares the template body for the unimplemented data.
sub prep_unimp_posting
{
&slumber;
my $impldiff = shift;
my $language = shift;
my $unimplisting = "";
foreach my $taskname (@$alltasks)
{
# If it's a category task, the task name will be slightly different.
my $baretaskname = $taskname;
$baretaskname =~ s/^Category://;
my $implpage = $taskname;
$implpage =~ s/^Category:(.*)/$1\/$language/;
# my $escapedImplPage = $implpage;
# $escapedImplPage =~ s/ /_/g;
my $link;
$link = "{{unimpl task link|$baretaskname|$language}}";
# $link = "[http://rosettacode.org/mw/index.php?action=edit&title=$escapedImplPage $implpage]";
# Add the task to the unimplemented list, if it's unimplemented, and if it's not in the omit list.
unless(exists $impldiff->{$language}->{'impl'}->{$implpage} or exists $impldiff->{$language}->{'omit'}->{$implpage})
{
$unimplisting .= "* $link\n";
}
}
return &build_posting("Template:" . &get_unimpl_template_name($language), $unimplisting);
}
# Prepares the template body for the listings of omitted data for a page.
sub prep_omit_posting
{
&slumber;
my $implediff = shift;
my $language = shift;
my $omitlisting = "";
my $omittemplatename = &get_omit_template_name($language);
foreach my $taskname (@$alltasks)
{
# We want the task name, not the fully-qualified wiki name.
my $baretaskname = $taskname;
$baretaskname =~ s/^Category://;
# Add the task to the omission list, if it's omitted.
my $escapedTaskName = $baretaskname;
$escapedTaskName =~ s/ /_/g;
$omitlisting .= "* [http://rosettacode.org/mw/index.php?action=edit&title=$escapedTaskName $baretaskname]\n"
if(exists $impldiff{$language}->{'omit'}->{$taskname})
}
# Note that there's no data in the template.
$implediff->{$language}->{'omit'} = 0
if( "" eq $omitlisting );
return &build_posting("Template:$omittemplatename", $omitlisting);
}
# Prepares the page body that folks look at to find out what changed for a language.
sub prep_listing_posting
{
&slumber;
my $impldiff = shift;
my $language = shift;
my $targetcount = $impldiff->{$language}->{'target_count'};
my $unimpcount = $impldiff->{$language}->{'unimp_count'};
# Prepare template fields
my $langfield = "|$language";
my $unimpfield = "|$unimpcount";
my $tcfield = "|$targetcount";
my $impperccalc = 0;
$impperccalc = (($targetcount - $unimpcount) / $targetcount) * 100
unless ($targetcount == 0);
my $imppercfield = sprintf "|%u", $impperccalc;
# Prepare the listing page format.
my $pagedata = '{{unimpl_header' . $langfield . $unimpfield . $tcfield . $imppercfield . '}}';
$pagedata .= "{{" . &get_unimpl_template_name($language) . "$unimpfield}}";
if (0 != $impldiff->{$language}->{'omit'})
{
$pagedata .= "{{omit_header" . "$langfield}}";
$pagedata .= "{{" . &get_omit_template_name($language) ."}}";
}
$pagedata .= "{{unimpl_footer$langfield}}";
return &build_posting(&get_listing_name($language), $pagedata);
}
# Prepare the body of the omit category.
sub prep_omit_cat_posting
{
&slumber;
my $implediff = shift;
my $language = shift;
return &build_posting(&get_omit_cat_name($language), "{{omit_cat|$language}}");
}
# Prepare the page title for the page folks look at to see what change what.
sub get_listing_name
{
&slumber;
my $language = shift;
}
# Prepare the template name for the omit listing body
sub get_omit_template_name
{
&slumber;
my $language = shift;
return "unimp_omit_body_$language";
}
# Prepare the template name for the unimplimended listing body
sub get_unimpl_template_name
{
&slumber;
my $language = shift;
return "unimp_body_$language";
}
# Return the name of the category to find the omitted pages for a particular language.
sub get_omit_cat_name
{
&slumber;
my $language = shift;
return "Category:$language/Omit";
}
# Prepare all the pages associated with a specific language.
sub process_language
{
&slumber;
my $implediff = shift;
my $language = shift;
my $unimpl = &prep_unimp_posting($implediff, $language);
my $omit = &prep_omit_posting($implediff, $language);
my $listing = &prep_listing_posting($implediff, $language);
my %langpostings = ( %$unimpl, %$omit, %$listing );
unless ( exists $createdomitcategories{&get_omit_cat_name($language)} )
{
my $omit_cat = &prep_omit_cat_posting($implediff, $language);
%langpostings = ( %langpostings, %$omit_cat );
}
return \%langpostings;
}
sub wikitxt_pathname {
&slumber;
my $page_name = $_[0];
"test/" . sanitizenamefs($page_name) . ".wikitxt";
}
# Prepare and post all of the core pages for this run.
sub processimplediff
{
&slumber;
my $implediff = shift;
my %work;
# Prepare all our work.
foreach my $language (keys %$implediff)
{
# &out("Preparing data for:$language\n", 4);
my $workitem = &process_language($implediff, $language);
%work = ( %work, %$workitem );
}
# Now that we've prepared all our work, commit it.
foreach my $pagename (keys %work)
{
&postpage($pagename, $work{$pagename}, "Updating $pagename", 0);
}
}
# Return the Mediawiki editor object.
sub geteditor
{
&slumber;
# If we're not posting, and we're only drawing from cache
# We don't actually need to pull from the wiki.
if( exists $options{'cacheonly'} )
{
return undef
unless exists $options{'post'};
}
# Handles interaction with the wiki.
# Note that I had to modify HTTP::Message to make it work
# HTTP::Message silently failed when presented by MW
# with an encoding type of "application/json" or some such.
&out("Creating editor\n", 3);
my $editor = MediaWiki::Bot->new('ImpleSearchBot');
$editor->{debug} = 1;
# Tell the editor to edit Rosetta Code. I'm sure Wikipedia didn't like
# my initial attempts from before I added this line.
&out("Trying to set wiki.\n",3);
$editor->set_wiki('rosettacode.org','mw');
# If we're not posting, we don't need to log in if we're pulling from cache.
# Otherwise, attempt to log in.
&out("Trying to log in.\n", 3);
my $loginres = $editor->login($options{'username'}, $options{'password'});
die "Unable to login: " . $loginres
unless $loginres == 0;
return $editor;
}
# Simple logging infrastructure. Current sends to STDWARN or STDOUT, which cron
# emails to the user, which gets forwarded to Short Circuit. (Anyone want a copy?)
sub out
{
&slumber;
my $string = shift;
my $loglevel = shift;
chomp $string;
push @logoutput, ($string);
if($options{'verbosity'} >= $loglevel)
{
if($loglevel == 1)
{
warn $string; # use stderr.
}
else
{
print "$string\n";
}
}
}
# Many popular filesystems can't andle : and \ in filenames.
# Since I plan to open the SVN repo we save to the rest of the world at some point,
# I'm trying to make sure the files are representable.
sub sanitizenamefs
{
&slumber;
my $pagename = shift;
$pagename =~ tr/:\//__/;
return $pagename;
}
# Find all the entries that are in the second list ref, but not the first.
sub diffcat_simple
{
&slumber;
my $first = shift;
my $second = shift;
my %firsthash = map { $_, 1 } @$first;
my @new = ();
foreach my $secondelement (@$second)
{
push @new, $secondelement
unless exists $firsthash{$secondelement};
}
return @new;
}
# Find all the entries that are in one listref, but not the other.
sub diffcat
{
&slumber;
my $first = shift;
my $second = shift;
my @onlyinfirst = &diffcat_simple($second, $first);
my @onlyinsecond = &diffcat_simple($first, $second);
return (\@onlyinfirst, \@onlyinsecond);
}
sub slurp_file {
&slumber;
my $pathname = $_[0];
open(my $stream, "<", $pathname) or do {warn"Can't open '$pathname': $!\n";
return};
my $ret;
while (1) {
my $len = read($stream, my $buf, 1024);
if (!defined($len)) { die "I/O error while reading '$pathname': $!"; }
elsif ($len == 0) { last }
else { $ret .= $buf }
}
$ret;
}
# Post a page (or save it to disk, if we're testing.)
sub postpage
{
&slumber;
my $pagename = shift;
my $pagedata = shift;
my $remark = shift;
my $minoredit = shift;
# MediaWiki won't let us create blank, empty pages.
# But since we don't want to query (or cache) to see if
# the page already exists, we'll just add an HTML
# non-breaking-space entity if the page is truly empty.
$pagedata = " "
if(0 == length $pagedata);
++$pagesedited;
unless( exists $options{'post'} )
{
$pagename = wikitxt_pathname($pagename);
# save it to disk, and out of the way.
&out("Saving: $pagename\n", 2);
open my $outfile, '>', $pagename
or &out("Failed to open $pagename: $!", 1);
return unless defined $outfile;
print $outfile $pagedata;
close $outfile;
}
else {
&out("Posting $pagename\n", 2);
$editor->edit($pagename, $pagedata, "ImplSearchBot:$remark", $minoredit)
or &out("Failed to post page: " . $editor->{'errstr'}, 1);
}
}
# This allows us to compare site state between now and when we
# last ran, so that we can learn to avoid doing unnecessary work.
# (Saves on server resources and bloated edit statistics...)
# Also, the data gets saved to an SVN repo, so that multiple bots can
# use the history. We're using JSON, as JSON has broader
# cross-language support than Data::Dumper, making it easier for others
# to use the data.
sub cachedata
{
&slumber;
my $dataname = shift;
my $data = shift;
my $filename = $options{'cachepath'} . &sanitizenamefs($dataname . ".json");
unless( exists $options{'post'} )
{
# This is just a test run. Best not to overwrite our cache.
&out("Skipping caching of $dataname to $filename...", 4);
return;
}
my $outfile;
unless(open $outfile, '>', $filename)
{
&out("Failed to open $filename for caching: $!\n", 1);
return;
}
print $outfile to_json($data);
close $outfile;
&out(scalar @$data . " members cached to $filename.\n", 5);
}
# Return data we cached previously.
sub getcacheddata
{
&slumber;
my $dataname = shift;
my $filename = $options{'cachepath'} . &sanitizenamefs($dataname . ".json");
&out("Getting cached data for $dataname from $filename...", 4);
my $infile;
unless (open $infile, '<', $filename)
{
&out("Failed to load cached data $filename: $!\n", 5);
return [];
}
my $jsondata;
$jsondata .= $_ while <$infile>;
close $infile;
my $cacheddata = from_json($jsondata);
&out(scalar @$cacheddata . " cache members retrieved from $filename\n", 5);
++$cachehits;
return $cacheddata;
}
# Report the changes between two categories.
# More interesting than "x added, y removed"
sub reportcatchanges
{
&slumber;
my $category = shift;
my $old = shift;
my $new = shift;
my ($removed, $added) = &diffcat($old, $new);
my $out = "";
$out .= "Removed from $category:\n"
if( scalar @$removed > 0 );
$out .= "$_\n"
foreach (@$removed);
$out .= "Added to $category:\n"
if( scalar @$added > 0 );
$out .= "$_\n"
foreach (@$added);
&out($out, 2)
if("" ne $out);
}
# Pull the category data, or cached data if we're not pulling from the wiki.
sub getcategory
{
&slumber;
my $categoryname = shift;
&out("Getting category contents for $categoryname...", 4);
my $old;
my $new;
if( exists $options{'cacheonly'} )
{
# Return the cache data if we're not supposed to query the database.
$old = [];
$new = &getcacheddata($categoryname);
}
else
{
$old = &getcacheddata($categoryname);
$new = [$editor->get_pages_in_category($categoryname)];
++$categorypulls;
}
# &reportcatchanges($categoryname, $old, $new);
&cachedata("$categoryname", $new);
&out(scalar @$new . " members returned for $categoryname\n", 5);
return $new;
}
# Find if this category changed, report its contents if it has.
sub getwork
{
&slumber;
my $categoryname = shift;
&out("Getting work for $categoryname.\n",4);
my $cacheddata = &getcacheddata($categoryname);
# If we're on a cache-only basis, we'll just say we have no old data,
# and that our cached data is our new data.
my $olddata;
my $newdata;
if (exists $options{'cacheonly'})
{
$olddata = [];
$newdata = $cacheddata;
}
else
{
$olddata = $cacheddata;
$newdata = &getcategory($categoryname);
}
my ($removed, $added) = &diffcat($olddata, $newdata);
my $impl_count_change = scalar @$removed + @$added;
my $work = [];
$work = $newdata
if(($impl_count_change + $task_count_change) > 0);
&out(scalar @$work . " items to process for $categoryname.\n", 3);
return @$work;
}
# If changes occurred, the info is more important than if they didn't.
sub getloglevelfromdiff
{
&slumber;
my $base = shift;
my $first = shift;
my $second = shift;
return $base
if((scalar @$first + scalar @$second) > 0);
return $base + 1;
}
# Find all the work items for a given language.
sub getlangwork
{
&slumber;
my $language = shift;
&out("Getting lang work for $language\n", 4);
my %impl;
foreach my $workitem (&getwork("Category:$language"))
{
$impl{$workitem} = 1;
}
my %omit;
foreach my $workitem (&getwork("Category:$language/Omit"))
{
$omit{$workitem} = 1;
}
# No work to do?
return undef
if( ( 0 == scalar keys %omit) && (0 == scalar keys %impl));
# Language metadata
my $unimpcount = scalar @$alltasks - scalar keys %impl;
my $targetcount = (scalar @$alltasks - scalar keys %omit);
# work to do.
return {'impl' => \%impl,
'omit' => \%omit,
'unimp_count' => $unimpcount,
'target_count' => $targetcount };
}
# Commit the cache.
sub commitcache
{
&slumber;
# First, find out if we've added any files.
my $cachepath = $options{'cachepath'};
opendir(CACHEDIR, $cachepath);
my @current = readdir(CACHEDIR);
close(CACHEDIR);
# We need to run svn adds if we've created any new files.
# Maybe we'll use SVN::Client some day. Not right now.
my ($added, $removed) = &diffcat(\@current, \@initialcache);
&out("Detected " . scalar @$added . " new cache files and " . scalar @$removed . " removed\n",&getloglevelfromdiff(2, $added, $removed));
if ((scalar @$added + scalar @$removed ) > 0)
{
foreach my $cachefile (@$added)
{
&svn('add', $cachepath . $cachefile);;
}
foreach my $cachefile (@$removed)
{
&svn('remove', $cachepath . $cachefile);;
}
}
&svn('ci', '--message="ImplSearchBot run"', $cachepath);
&svn('update', $cachepath);
}
# Wrap svn commands so we can log them.
sub svn
{
&slumber;
return
if(exists $options{'nosvn'});
my @args = @_;
my $string = "system 'svn'";
$string .= ", '$_'"
foreach (@args);
$string .= "\n";
&out($string, 3);
system 'svn', @args
if(exists $options{'post'});
}
sub getopt {
&slumber;
#I don't care to pollute my global scope.
my $wiki = 'rosettacode.org';
my $username;
my $password;
my $verbosity = 2; # verbosity level. 0 is silent. 1 is error only. 2 is updates. 3 is process, more is noisy.
my $post; # Is this an actual run?
my $cachepath = "/tmp/";
my $cacheonly; # Don't query the wiki for data. Just pull from cache.
my $nosvn;
my $rebuild_all;
my $opt_matrix = {
"wiki=s" => \$wiki,
"username=s" => \$username,
"password=s" => \$password,
"verbosity=s" => \$verbosity,
"post" => \$post,
"cacheonly" => \$cacheonly,
"nosvn" => \$nosvn,
"cachepath=s" => \$cachepath,
"rebuildall" => \$rebuild_all };
my $result = GetOptions( %$opt_matrix );
$options{'wiki'} = $wiki;
$options{'nosvn'} = $nosvn
if defined $nosvn;
$options{'username'} = $username
if defined $username;
$options{'password'} = $password
if defined $password;
$options{'post'} = $post
if defined $post;
$options{'verbosity'} = $verbosity;
$cachepath .= '/'
if('/' ne substr($cachepath, -1, 1));
$options{'cachepath'} = $cachepath;
$options{'cacheonly'} = $cacheonly
if defined $cacheonly;
$options{'rebuild_all'} = $rebuild_all
if defined $rebuild_all;
my $usage = "Usage: $0 (options)\n The available options are:\n";
$usage .= "\t--$_\n"
foreach (keys %$opt_matrix);
$usage .= "Username and password are required if you need to pull data from the wiki. Wiki defaults to Rosetta Code.\n";
my $wikineeded;
$wikineeded = "yes"
unless ( exists $options{'cacheonly'} );
$wikineeded = "yes"
if ( exists $options{'post'} );
if(defined $wikineeded)
{
unless(exists $options{'username'} and exists $options{'password'})
{
die $usage;
}
}
}
sub main {
&slumber;
unless (@ARGV) { @ARGV = @_; }
getopt();
#Statistic tracking.
# Tracking for svn checkin at end.
# We *should* be the only ones writing to the cache path for now.
# Anyone else should have their own checkout of the data,
# and test runs are done with --cacheonly and --nosvn.
opendir(my $cachedir, $options{'cachepath'})
or die "Unable to open cache directory";
@initialcache = readdir($cachedir);
closedir($cachedir);
&out(scalar @initialcache . " categories initially cached\n", 4);
# Get our editor
$editor = &geteditor();
# Get a complete listing of the tasks.
&out("Getting tasks\n", 3);
$alltasks = &getcategory('Category:Programming Tasks');
# Get a complete listing of the languages.
&out("Getting the languages.\n", 3);
$alllanguages= &getcategory('Category:Programming Languages');
# Quick check. Did we add or lose any tasks? If so, we've got to recalc *all*
# of the pages. :-/
unless(exists $options{'rebuild_all'}) {
my $cacheddata = &getcacheddata('Category:Programming Tasks');
my ($added, $removed) = &diffcat($alltasks, $cacheddata);
$task_count_change = scalar @$added + scalar @$removed;
}
else {
$task_count_change = 1; # It just needs to be nonzero...
}
# We want the language name, not the fully-qualified wiki name.
$_ =~ s/^Category:// foreach (@$alllanguages);
# Get a list of the languages for which we've already provided bodies for the related omit categories.
# Store it as a hash, so the lookup will be faster.
my $omitcatcontents = &getcategory('Category:Maintenance/OmitCategoriesCreated');
my %createdomitcategories = map {$_, 1} @$omitcatcontents;
&out("Identifying work to do\n", 3);
foreach my $lang (@$alllanguages) {
my $val = &getlangwork($lang);
next unless defined $val;
$impldiff{$lang} = $val;
}
&processimplediff(\%impldiff);
my $runtime = time() - $starttime;
my $statsdata = "Pages edited last run: $pagesedited<br/>Time to post all per-language updates: $runtime seconds<br/>Category pulls: $categorypulls<br/>Cache hits: $cachehits<br/>";
&out("Updating stats page. Runtime ($runtime), Pages edited ($pagesedited)\n", 2);
&postpage("User:ImplSearchBot/Stats", $statsdata, "Updating stats data", 0);
&out("Updating bot code page\n", 4);
open my $sourcefile, '<', $0
or die "Finished without updating bot source page";
my $botsource;
$botsource .= $_ while <$sourcefile>;
close $sourcefile;
my $tag = "lang";
&postpage("User:ImplSearchBot/Code", "<$tag perl>$botsource</$tag>", 0);
my $logdata = "<pre>" . join( "\n", @logoutput) . "</pre>";
&postpage("User:ImplSearchBot/Log", $logdata, 0);
&out("Updating cache\n", 4);
&commitcache();
&out("Done\n", 3);
exit 0
}
if (!caller) { exit main }
</lang>
|
Latest revision as of 21:16, 26 August 2009
<lang perl>#!/usr/bin/perl -w use strict; use Getopt::Long; use MediaWiki::Bot; use JSON qw/to_json from_json/;
our $editor; our $starttime = time; our $pagesedited = 0; our $categorypulls = 0; our $cachehits = 0; our $task_count_change; our $alltasks; our %impldiff; our %createdomitcategories; our @initialcache; our $alllanguages; our @logoutput;
my %options;
- ---------------------
- These are all the functions that break up our work
- into logical chunks.
- ---------------------
- Called in a *lot* of places to check the machine for overload.
sub slumber { # Snooze. while(&snooze) {}; }
- A reversed snooze buton
sub snooze { open my $loadavg, '<', '/proc/loadavg' or die "Failed to check load average: $!"; my $loadstr = <$loadavg>; close $loadavg;
# Wait one minute for every process in the wait queue. my @avgs = split / +/, $loadstr, 3;
my $sleeptime = 0;
# Sleep at least one minute for each process in the wait queue. $sleeptime = $avgs[0] * 60;
# If we're currently under notable load
if($sleeptime > 30)
{
print "Calculated sleeptime: $sleeptime\n";
print "Load averages: " . join(' ', @avgs) . " ... Sleeping $sleeptime seconds\n";
sleep $sleeptime;
return $sleeptime;
}
return 0; }
- Builds a simple hash ref associating a page name with body.
- Used to help us prepare our postings in one pass, then commit them
- in a second pass.
sub build_posting { &slumber;
my $name = shift; my $body = shift; return {$name => $body};
}
- Prepares the template body for the unimplemented data.
sub prep_unimp_posting { &slumber;
my $impldiff = shift; my $language = shift; my $unimplisting = ""; foreach my $taskname (@$alltasks) { # If it's a category task, the task name will be slightly different. my $baretaskname = $taskname; $baretaskname =~ s/^Category://; my $implpage = $taskname; $implpage =~ s/^Category:(.*)/$1\/$language/;
# my $escapedImplPage = $implpage;
# $escapedImplPage =~ s/ /_/g; my $link;
$link = "Template:Unimpl task link";
- $link = "$implpage";
# Add the task to the unimplemented list, if it's unimplemented, and if it's not in the omit list. unless(exists $impldiff->{$language}->{'impl'}->{$implpage} or exists $impldiff->{$language}->{'omit'}->{$implpage}) { $unimplisting .= "* $link\n"; } } return &build_posting("Template:" . &get_unimpl_template_name($language), $unimplisting);
}
- Prepares the template body for the listings of omitted data for a page.
sub prep_omit_posting { &slumber;
my $implediff = shift; my $language = shift; my $omitlisting = ""; my $omittemplatename = &get_omit_template_name($language); foreach my $taskname (@$alltasks) { # We want the task name, not the fully-qualified wiki name. my $baretaskname = $taskname; $baretaskname =~ s/^Category://; # Add the task to the omission list, if it's omitted. my $escapedTaskName = $baretaskname; $escapedTaskName =~ s/ /_/g; $omitlisting .= "* $baretaskname\n" if(exists $impldiff{$language}->{'omit'}->{$taskname}) } # Note that there's no data in the template. $implediff->{$language}->{'omit'} = 0 if( "" eq $omitlisting ); return &build_posting("Template:$omittemplatename", $omitlisting);
}
- Prepares the page body that folks look at to find out what changed for a language.
sub prep_listing_posting { &slumber;
my $impldiff = shift; my $language = shift; my $targetcount = $impldiff->{$language}->{'target_count'}; my $unimpcount = $impldiff->{$language}->{'unimp_count'}; # Prepare template fields my $langfield = "|$language"; my $unimpfield = "|$unimpcount"; my $tcfield = "|$targetcount"; my $impperccalc = 0; $impperccalc = (($targetcount - $unimpcount) / $targetcount) * 100
unless ($targetcount == 0);
my $imppercfield = sprintf "|%u", $impperccalc; # Prepare the listing page format. my $pagedata = 'Template:Unimpl header' . $langfield . $unimpfield . $tcfield . $imppercfield . ''; $pagedata .= "Template:" . &get unimpl template name($language) . "$unimpfield"; if (0 != $impldiff->{$language}->{'omit'}) { $pagedata .= "Template:Omit header" . "$langfield"; $pagedata .= "Template:" . &get omit template name($language) .""; } $pagedata .= "Template:Unimpl footer$langfield"; return &build_posting(&get_listing_name($language), $pagedata);
}
- Prepare the body of the omit category.
sub prep_omit_cat_posting { &slumber;
my $implediff = shift; my $language = shift; return &build_posting(&get_omit_cat_name($language), "Tasks listed here have been marked as "un-implementable" in $language. Solutions may be impossible to do, too complex to be of any valuable instruction in $language, or prohibited by the task definition.
But hey; if you think you can prove us wrong, go for it. :-) "); }
- Prepare the page title for the page folks look at to see what change what.
sub get_listing_name { &slumber;
my $language = shift; return "Tasks not implemented in $language";
}
- Prepare the template name for the omit listing body
sub get_omit_template_name { &slumber;
my $language = shift; return "unimp_omit_body_$language";
}
- Prepare the template name for the unimplimended listing body
sub get_unimpl_template_name { &slumber;
my $language = shift; return "unimp_body_$language";
}
- Return the name of the category to find the omitted pages for a particular language.
sub get_omit_cat_name { &slumber;
my $language = shift; return "Category:$language/Omit";
}
- Prepare all the pages associated with a specific language.
sub process_language { &slumber;
my $implediff = shift; my $language = shift; my $unimpl = &prep_unimp_posting($implediff, $language); my $omit = &prep_omit_posting($implediff, $language); my $listing = &prep_listing_posting($implediff, $language); my %langpostings = ( %$unimpl, %$omit, %$listing ); unless ( exists $createdomitcategories{&get_omit_cat_name($language)} ) { my $omit_cat = &prep_omit_cat_posting($implediff, $language); %langpostings = ( %langpostings, %$omit_cat ); } return \%langpostings;
}
sub wikitxt_pathname { &slumber;
my $page_name = $_[0]; "test/" . sanitizenamefs($page_name) . ".wikitxt";
}
- Prepare and post all of the core pages for this run.
sub processimplediff { &slumber;
my $implediff = shift; my %work; # Prepare all our work. foreach my $language (keys %$implediff) { # &out("Preparing data for:$language\n", 4); my $workitem = &process_language($implediff, $language); %work = ( %work, %$workitem ); } # Now that we've prepared all our work, commit it. foreach my $pagename (keys %work) { &postpage($pagename, $work{$pagename}, "Updating $pagename", 0); }
}
- Return the Mediawiki editor object.
sub geteditor { &slumber;
# If we're not posting, and we're only drawing from cache # We don't actually need to pull from the wiki. if( exists $options{'cacheonly'} ) { return undef unless exists $options{'post'}; } # Handles interaction with the wiki. # Note that I had to modify HTTP::Message to make it work # HTTP::Message silently failed when presented by MW # with an encoding type of "application/json" or some such. &out("Creating editor\n", 3); my $editor = MediaWiki::Bot->new('ImpleSearchBot'); $editor->{debug} = 1; # Tell the editor to edit Rosetta Code. I'm sure Wikipedia didn't like # my initial attempts from before I added this line. &out("Trying to set wiki.\n",3); $editor->set_wiki('rosettacode.org','mw'); # If we're not posting, we don't need to log in if we're pulling from cache. # Otherwise, attempt to log in. &out("Trying to log in.\n", 3); my $loginres = $editor->login($options{'username'}, $options{'password'}); die "Unable to login: " . $loginres unless $loginres == 0; return $editor;
}
- Simple logging infrastructure. Current sends to STDWARN or STDOUT, which cron
- emails to the user, which gets forwarded to Short Circuit. (Anyone want a copy?)
sub out { &slumber;
my $string = shift; my $loglevel = shift; chomp $string;
push @logoutput, ($string); if($options{'verbosity'} >= $loglevel) { if($loglevel == 1) { warn $string; # use stderr. } else { print "$string\n"; } }
}
- Many popular filesystems can't andle : and \ in filenames.
- Since I plan to open the SVN repo we save to the rest of the world at some point,
- I'm trying to make sure the files are representable.
sub sanitizenamefs { &slumber;
my $pagename = shift; $pagename =~ tr/:\//__/; return $pagename;
}
- Find all the entries that are in the second list ref, but not the first.
sub diffcat_simple { &slumber;
my $first = shift; my $second = shift; my %firsthash = map { $_, 1 } @$first; my @new = (); foreach my $secondelement (@$second) { push @new, $secondelement unless exists $firsthash{$secondelement}; } return @new;
}
- Find all the entries that are in one listref, but not the other.
sub diffcat { &slumber;
my $first = shift; my $second = shift; my @onlyinfirst = &diffcat_simple($second, $first); my @onlyinsecond = &diffcat_simple($first, $second); return (\@onlyinfirst, \@onlyinsecond);
}
sub slurp_file { &slumber;
my $pathname = $_[0]; open(my $stream, "<", $pathname) or do {warn"Can't open '$pathname': $!\n"; return}; my $ret; while (1) { my $len = read($stream, my $buf, 1024); if (!defined($len)) { die "I/O error while reading '$pathname': $!"; } elsif ($len == 0) { last } else { $ret .= $buf } } $ret;
}
- Post a page (or save it to disk, if we're testing.)
sub postpage { &slumber;
my $pagename = shift; my $pagedata = shift; my $remark = shift; my $minoredit = shift; # MediaWiki won't let us create blank, empty pages. # But since we don't want to query (or cache) to see if # the page already exists, we'll just add an HTML # non-breaking-space entity if the page is truly empty. $pagedata = " " if(0 == length $pagedata); ++$pagesedited; unless( exists $options{'post'} ) { $pagename = wikitxt_pathname($pagename); # save it to disk, and out of the way. &out("Saving: $pagename\n", 2); open my $outfile, '>', $pagename or &out("Failed to open $pagename: $!", 1); return unless defined $outfile; print $outfile $pagedata; close $outfile; } else {
&out("Posting $pagename\n", 2); $editor->edit($pagename, $pagedata, "ImplSearchBot:$remark", $minoredit) or &out("Failed to post page: " . $editor->{'errstr'}, 1); }
}
- This allows us to compare site state between now and when we
- last ran, so that we can learn to avoid doing unnecessary work.
- (Saves on server resources and bloated edit statistics...)
- Also, the data gets saved to an SVN repo, so that multiple bots can
- use the history. We're using JSON, as JSON has broader
- cross-language support than Data::Dumper, making it easier for others
- to use the data.
sub cachedata { &slumber; my $dataname = shift; my $data = shift; my $filename = $options{'cachepath'} . &sanitizenamefs($dataname . ".json"); unless( exists $options{'post'} ) { # This is just a test run. Best not to overwrite our cache. &out("Skipping caching of $dataname to $filename...", 4); return; } my $outfile; unless(open $outfile, '>', $filename) { &out("Failed to open $filename for caching: $!\n", 1); return; }
print $outfile to_json($data); close $outfile; &out(scalar @$data . " members cached to $filename.\n", 5); }
- Return data we cached previously.
sub getcacheddata { &slumber; my $dataname = shift; my $filename = $options{'cachepath'} . &sanitizenamefs($dataname . ".json"); &out("Getting cached data for $dataname from $filename...", 4); my $infile; unless (open $infile, '<', $filename) { &out("Failed to load cached data $filename: $!\n", 5); return []; }
my $jsondata; $jsondata .= $_ while <$infile>; close $infile;
my $cacheddata = from_json($jsondata); &out(scalar @$cacheddata . " cache members retrieved from $filename\n", 5);
++$cachehits; return $cacheddata; }
- Report the changes between two categories.
- More interesting than "x added, y removed"
sub reportcatchanges { &slumber; my $category = shift; my $old = shift; my $new = shift;
my ($removed, $added) = &diffcat($old, $new);
my $out = "";
$out .= "Removed from $category:\n" if( scalar @$removed > 0 ); $out .= "$_\n" foreach (@$removed);
$out .= "Added to $category:\n" if( scalar @$added > 0 ); $out .= "$_\n" foreach (@$added);
&out($out, 2) if("" ne $out); }
- Pull the category data, or cached data if we're not pulling from the wiki.
sub getcategory { &slumber; my $categoryname = shift; &out("Getting category contents for $categoryname...", 4);
my $old; my $new;
if( exists $options{'cacheonly'} ) { # Return the cache data if we're not supposed to query the database. $old = []; $new = &getcacheddata($categoryname); } else { $old = &getcacheddata($categoryname); $new = [$editor->get_pages_in_category($categoryname)]; ++$categorypulls; }
# &reportcatchanges($categoryname, $old, $new);
&cachedata("$categoryname", $new);
&out(scalar @$new . " members returned for $categoryname\n", 5);
return $new; }
- Find if this category changed, report its contents if it has.
sub getwork { &slumber; my $categoryname = shift; &out("Getting work for $categoryname.\n",4); my $cacheddata = &getcacheddata($categoryname);
# If we're on a cache-only basis, we'll just say we have no old data, # and that our cached data is our new data. my $olddata; my $newdata; if (exists $options{'cacheonly'}) { $olddata = []; $newdata = $cacheddata; } else { $olddata = $cacheddata; $newdata = &getcategory($categoryname); }
my ($removed, $added) = &diffcat($olddata, $newdata); my $impl_count_change = scalar @$removed + @$added; my $work = []; $work = $newdata if(($impl_count_change + $task_count_change) > 0);
&out(scalar @$work . " items to process for $categoryname.\n", 3);
return @$work; }
- If changes occurred, the info is more important than if they didn't.
sub getloglevelfromdiff { &slumber; my $base = shift; my $first = shift; my $second = shift; return $base if((scalar @$first + scalar @$second) > 0); return $base + 1; }
- Find all the work items for a given language.
sub getlangwork { &slumber; my $language = shift; &out("Getting lang work for $language\n", 4); my %impl; foreach my $workitem (&getwork("Category:$language")) { $impl{$workitem} = 1; } my %omit; foreach my $workitem (&getwork("Category:$language/Omit")) { $omit{$workitem} = 1; }
# No work to do? return undef if( ( 0 == scalar keys %omit) && (0 == scalar keys %impl));
# Language metadata my $unimpcount = scalar @$alltasks - scalar keys %impl; my $targetcount = (scalar @$alltasks - scalar keys %omit);
# work to do.
return {'impl' => \%impl,
'omit' => \%omit,
'unimp_count' => $unimpcount,
'target_count' => $targetcount }; }
- Commit the cache.
sub commitcache { &slumber; # First, find out if we've added any files. my $cachepath = $options{'cachepath'}; opendir(CACHEDIR, $cachepath);
my @current = readdir(CACHEDIR); close(CACHEDIR);
# We need to run svn adds if we've created any new files. # Maybe we'll use SVN::Client some day. Not right now.
my ($added, $removed) = &diffcat(\@current, \@initialcache); &out("Detected " . scalar @$added . " new cache files and " . scalar @$removed . " removed\n",&getloglevelfromdiff(2, $added, $removed));
if ((scalar @$added + scalar @$removed ) > 0) { foreach my $cachefile (@$added) { &svn('add', $cachepath . $cachefile);; }
foreach my $cachefile (@$removed) { &svn('remove', $cachepath . $cachefile);; } }
&svn('ci', '--message="ImplSearchBot run"', $cachepath); &svn('update', $cachepath); }
- Wrap svn commands so we can log them.
sub svn { &slumber; return if(exists $options{'nosvn'}); my @args = @_;
my $string = "system 'svn'"; $string .= ", '$_'" foreach (@args);
$string .= "\n";
&out($string, 3);
system 'svn', @args if(exists $options{'post'}); }
sub getopt { &slumber;
#I don't care to pollute my global scope. my $wiki = 'rosettacode.org'; my $username; my $password; my $verbosity = 2; # verbosity level. 0 is silent. 1 is error only. 2 is updates. 3 is process, more is noisy. my $post; # Is this an actual run? my $cachepath = "/tmp/"; my $cacheonly; # Don't query the wiki for data. Just pull from cache. my $nosvn; my $rebuild_all; my $opt_matrix = { "wiki=s" => \$wiki, "username=s" => \$username, "password=s" => \$password, "verbosity=s" => \$verbosity, "post" => \$post, "cacheonly" => \$cacheonly, "nosvn" => \$nosvn, "cachepath=s" => \$cachepath, "rebuildall" => \$rebuild_all }; my $result = GetOptions( %$opt_matrix ); $options{'wiki'} = $wiki; $options{'nosvn'} = $nosvn if defined $nosvn; $options{'username'} = $username if defined $username; $options{'password'} = $password if defined $password; $options{'post'} = $post if defined $post; $options{'verbosity'} = $verbosity; $cachepath .= '/' if('/' ne substr($cachepath, -1, 1)); $options{'cachepath'} = $cachepath; $options{'cacheonly'} = $cacheonly if defined $cacheonly; $options{'rebuild_all'} = $rebuild_all if defined $rebuild_all; my $usage = "Usage: $0 (options)\n The available options are:\n"; $usage .= "\t--$_\n" foreach (keys %$opt_matrix); $usage .= "Username and password are required if you need to pull data from the wiki. Wiki defaults to Rosetta Code.\n"; my $wikineeded; $wikineeded = "yes" unless ( exists $options{'cacheonly'} ); $wikineeded = "yes" if ( exists $options{'post'} ); if(defined $wikineeded) { unless(exists $options{'username'} and exists $options{'password'}) { die $usage; } }
}
sub main { &slumber;
unless (@ARGV) { @ARGV = @_; }
getopt();
- Statistic tracking.
- Tracking for svn checkin at end.
- We *should* be the only ones writing to the cache path for now.
- Anyone else should have their own checkout of the data,
- and test runs are done with --cacheonly and --nosvn.
opendir(my $cachedir, $options{'cachepath'}) or die "Unable to open cache directory"; @initialcache = readdir($cachedir); closedir($cachedir); &out(scalar @initialcache . " categories initially cached\n", 4);
- Get our editor
$editor = &geteditor();
- Get a complete listing of the tasks.
&out("Getting tasks\n", 3); $alltasks = &getcategory('Category:Programming Tasks');
- Get a complete listing of the languages.
&out("Getting the languages.\n", 3); $alllanguages= &getcategory('Category:Programming Languages');
- Quick check. Did we add or lose any tasks? If so, we've got to recalc *all*
- of the pages. :-/
unless(exists $options{'rebuild_all'}) { my $cacheddata = &getcacheddata('Category:Programming Tasks'); my ($added, $removed) = &diffcat($alltasks, $cacheddata); $task_count_change = scalar @$added + scalar @$removed; } else { $task_count_change = 1; # It just needs to be nonzero... }
- We want the language name, not the fully-qualified wiki name.
$_ =~ s/^Category:// foreach (@$alllanguages);
- Get a list of the languages for which we've already provided bodies for the related omit categories.
- Store it as a hash, so the lookup will be faster.
my $omitcatcontents = &getcategory('Category:Maintenance/OmitCategoriesCreated'); my %createdomitcategories = map {$_, 1} @$omitcatcontents;
&out("Identifying work to do\n", 3);
foreach my $lang (@$alllanguages) { my $val = &getlangwork($lang); next unless defined $val; $impldiff{$lang} = $val; }
&processimplediff(\%impldiff);
my $runtime = time() - $starttime; my $statsdata = "Pages edited last run: $pagesedited
Time to post all per-language updates: $runtime seconds
Category pulls: $categorypulls
Cache hits: $cachehits
"; &out("Updating stats page. Runtime ($runtime), Pages edited ($pagesedited)\n", 2);
&postpage("User:ImplSearchBot/Stats", $statsdata, "Updating stats data", 0);
&out("Updating bot code page\n", 4);
open my $sourcefile, '<', $0 or die "Finished without updating bot source page";
my $botsource; $botsource .= $_ while <$sourcefile>;
close $sourcefile;
my $tag = "lang";
&postpage("User:ImplSearchBot/Code", "<$tag perl>$botsource</$tag>", 0);
my $logdata = "
" . join( "\n", @logoutput) . "
";
&postpage("User:ImplSearchBot/Log", $logdata, 0);
&out("Updating cache\n", 4); &commitcache();
&out("Done\n", 3);
exit 0
}
if (!caller) { exit main } </lang>