User:ImplSearchBot/Code: Difference between revisions

From Rosetta Code
Content added Content deleted
m (ImplSearchBot:0)
m (ImplSearchBot:0)
Line 1: Line 1:
<lang perl>#!/usr/bin/perl -w
<lang perl>#!/usr/bin/perl -w
use strict;
use strict;
use Getopt::Long;
use MediaWiki::Bot;
use MediaWiki::Bot;
use JSON qw/to_json from_json/;
use JSON qw/to_json from_json/;


# Handles interaction with the wiki.
# Note that I had to modify HTTP::Message to make it work
# HTTP::Message silently failed when presented by MW
# with an encoding type of "application/json" or some such.


my $usage = "Usage: $0 (username) (password) [posttosite]";
my $usage = "Usage: $0 --username=(username) --password=(password) [--posttosite=yes]";


my %options;
my $username = shift @ARGV;
my $password = shift @ARGV;

defined $username
or die $usage;

defined $password
or die $usage;

#my $json = new JSON;
#$json->allow_blessed(1);
#$json->convert_blessed(1);

my $posttosite = shift @ARGV;
my $starttime = time;

my $pagesedited = 0;


#I don't care to pollute my global scope.
print "Creating editor\n";
my $editor = MediaWiki::Bot->new('ImpleSearchBot');
$editor->{debug} = 1;

sub sanitizenamefs
{
{
my $pagename = shift;
my $wiki = 'rosettacode.org';
my $username;
$pagename =~ tr/:\//__/;
my $password;
return $pagename;
my $verbosity = 2; # verbosity level. 0 is silent. 1 is error only. 2 is updates. 3 is process, more is noisy.
}
my $post; # Is this an actual run?
my $result = GetOptions(
"wiki=s" => \$wiki,
"username=s" => \$username,
"password=s" => \$password,
"verbosity=s" => \$verbosity,
"post" => \$post);
$options{'wiki'} = $wiki;


die $usage
sub postpage
unless defined $username;
{
my $pagename = shift;
my $pagedata = shift;
my $remark = shift;
my $minoredit = shift;


$options{'username'} = $username;
++$pagesedited;


die $usage
unless( defined $posttosite )
unless defined $password;
{
$pagename = &sanitizenamefs($pagename);
$pagename .= ".wikitxt";


$options{'password'} = $password;
print "Saving: $pagename\n";


$options{'post'} = $post
open my $outfile, '>', $pagename
if defined $post;
or warn "Failed to open $pagename: $!";


$options{'verbosity'} = $verbosity;
return unless defined $outfile;

print $outfile $pagedata;
close $outfile;
}
else
{
print "Posting $pagename\n";
$editor->edit($pagename, $pagedata, "ImplSearchBot:$remark", $minoredit)
or warn "Failed to post page: " . $editor->{'errstr'};
}
}
}


sub getcacheddata
{
my $dataname = shift;
my $filename = &sanitizenamefs("cache_" . $dataname . ".json");
open my $infile, '<', $filename
or warn "Failed to load cached data $filename: $!";


#Statistic tracking.
return undef unless defined $infile;
my $starttime = time;

my $jsondata;
my $pagesedited = 0;
$jsondata .= $_ while <$infile>;
close $infile;
return from_json($jsondata);
}

# Not doing anything with this yet. It's intended to allow us to compare site state between now and when we
# last ran, so that we can learn to avoid doing unnecessary work. (Server resources and bloated edit statistics...)
# Also, I plan on publishing the cache files in a version control system like SVN or Git, to allow multiple bots
# to share the data and to provide history. (It's the public target that necessitates using JSON instead of Data::Dumper)
# SVN has the advantage that I already know how to use it. Git has the advantage in that I can use GitHub and not tax
# my Slice with Git traffic, and I don't have to punch a hole in the firewall to access more services.
sub cachedata
{
my $dataname = shift;
my $data = shift;
my $filename = &sanitizenamefs("cache_" . $dataname . ".json");
open my $outfile, '>', $filename
or warn "Failed to cache $filename: $!";
return unless defined $outfile;


# Handles interaction with the wiki.

# Note that I had to modify HTTP::Message to make it work
print "Caching $filename:" . scalar $data . "\n";
# HTTP::Message silently failed when presented by MW
print $outfile to_json($data);
# with an encoding type of "application/json" or some such.
close $outfile;
&out("Creating editor\n", 3);
}
my $editor = MediaWiki::Bot->new('ImpleSearchBot');

$editor->{debug} = 1;
sub getcategory
{
my $categoryname = shift;
my @categorycontents = $editor->get_pages_in_category($categoryname);

&cachedata($categoryname, \@categorycontents);
return @categorycontents;
}


# Tell the editor to edit Rosetta Code. I'm sure Wikipedia didn't like
# Tell the editor to edit Rosetta Code. I'm sure Wikipedia didn't like
# my initial attempts from before I added this line.
# my initial attempts from before I added this line.
print "Trying to set wiki.\n";
&out("Trying to set wiki.\n",3);
$editor->set_wiki('rosettacode.org','w');
$editor->set_wiki('rosettacode.org','w');


# Attempt to log in.
# Attempt to log in.
print "Trying to log in.\n";
&out("Trying to log in.\n", 3);
unless("Success" == $editor->login($username, $password))
my $loginres = $editor->login($options{'username'}, $options{'password'});

{
die "Unable to login: " . $loginres
# No, it's not the "(expr) or die" syntax. This will be clearer
unless $loginres == "Success";
# for most folks who read the code.
die "Unable to login: " . from_json($editor);
}


# Get a complete listing of the tasks.
# Get a complete listing of the tasks.
print "Getting tasks\n";
&out("Getting tasks\n", 3);
my @alltasks = &getcategory('Category:Programming Tasks');
my @alltasks = &getcategory('Category:Programming Tasks');


# Get a complete listing of the languages.
# Get a complete listing of the languages.
print "Getting the languages.\n";
&out("Getting the languages.\n", 3);
my @alllanguages = &getcategory('Category:Programming Languages');
my @alllanguages = &getcategory('Category:Programming Languages');


Line 148: Line 81:
my %createdomitcategories = map {$_, 1} &getcategory('Category:Maintenance/OmitCategoriesCreated');
my %createdomitcategories = map {$_, 1} &getcategory('Category:Maintenance/OmitCategoriesCreated');


print "Identifying implemented and omitted languages\n";
&out("Identifying work to do\n", 3);

foreach my $language (@alllanguages)
my %impldiff;

foreach my $lang (@alllanguages)
{
{
my $val = &getlangwork($lang);
my %implemented = map {$_, 1} &getcategory("Category:$language");
next unless defined $val;
my %omitted = map {$_, 1} &getcategory("Category:$language/Omit");
$impldiff{$lang} = $val;
my $omitcount = scalar keys %omitted;
}


foreach my $language (keys %impldiff)
my $pagename = "Tasks not implemented in $language";
{
print "Preparing data for:$pagename\n";
my $pagename = "Tasks not implemented in $language";
&out("Preparing data for:$pagename\n", 3);
my $hashref = $impldiff{$language}->{'impl'};
my %implemented = %$hashref;
$hashref = $impldiff{$language}->{'omit'};
my %omitted = %$hashref;
my $omitcount = scalar keys %omitted;


# Language metadata
my $taskcount = scalar @alltasks;
my $unimpcount = $taskcount - scalar keys %implemented;
my $targetcount = ($taskcount - $omitcount);


# Language-specific page data.
# Language metadata
my $unimplisting = "";
my $taskcount = scalar @alltasks;
my $unimpcount = $taskcount - scalar keys %implemented;
my $omitlisting = "";
my $targetcount = ($taskcount - $omitcount);
my $pagedata; # Not assembled until the end.


# Language-specific page data.
foreach my $taskname (@alltasks)
my $unimplisting = "";
{
my $omitlisting = "";
# We want the task name, not the fully-qualified wiki name.
my $pagedata; # Not assembled until the end.
my $baretaskname = $taskname;
$baretaskname =~ s/^Category://;


foreach my $taskname (@alltasks)
# Add the task to the unimplemented list, if it's unimplemented.
{
$unimplisting .= "* [[$baretaskname]]\n"
# We want the task name, not the fully-qualified wiki name.
unless(exists $implemented{$taskname});
my $baretaskname = $taskname;
$baretaskname =~ s/^Category://;
# Add the task to the unimplemented list, if it's unimplemented.
$unimplisting .= "* [[$baretaskname]]\n"
unless(exists $implemented{$taskname});


# Add the task to the omission list, if it's omitted.
# Add the task to the omission list, if it's omitted.
$omitlisting .= "* [[$baretaskname]]\n"
$omitlisting .= "* [[$baretaskname]]\n"
if(exists $omitted{$taskname})
if(exists $omitted{$taskname})
}
}


# Prepare template fields
# Prepare template fields
my $langfield = "|$language";
my $langfield = "|$language";
my $unimpfield = "|$unimpcount";
my $unimpfield = "|$unimpcount";
my $tcfield = "|$targetcount";
my $tcfield = "|$targetcount";
my $impperccalc = 0;
$impperccalc = (($targetcount - $unimpcount) / $targetcount) * 100
unless ($targetcount == 0);


my $imppercfield = sprintf "|%u", $impperccalc;
my $impperccalc = 0;
$impperccalc = (($targetcount - $unimpcount) / $targetcount) * 100
unless ($targetcount == 0);
my $imppercfield = sprintf "|%u", $impperccalc;


my $unimpltemplatename = "unimp_body_$language";
my $unimpltemplatename = "unimp_body_$language";
my $omittemplatename = "unimp_omit_body_$language";
my $omittemplatename = "unimp_omit_body_$language";


# Prepare the listing page format.
# Prepare the listing page format.
$pagedata = '{{unimpl_header' . $langfield . $unimpfield . $tcfield . $imppercfield . '}}';
$pagedata = '{{unimpl_header' . $langfield . $unimpfield . $tcfield . $imppercfield . '}}';
$pagedata .= "{{$unimpltemplatename" . "$unimpfield}}";
$pagedata .= "{{$unimpltemplatename" . "$unimpfield}}";
$pagedata .= "{{omit_header" . "$langfield}}";
$pagedata .= "{{omit_header" . "$langfield}}";
$pagedata .= "{{$omittemplatename}}";
$pagedata .= "{{$omittemplatename}}";
$pagedata .= "{{unimpl_footer$langfield}}";
$pagedata .= "{{unimpl_footer$langfield}}";


# Post the template containing the listing of unimplemented tasks.
# Post the template containing the listing of unimplemented tasks.
&postpage("Template:$unimpltemplatename", "<includeonly>$unimplisting</includeonly>", "ImplSearchBot:Updating list body of unimplemented tasks.", 1);
&postpage("Template:$unimpltemplatename", "<includeonly>$unimplisting</includeonly>", "ImplSearchBot:Updating list body of unimplemented tasks.", 1);


# Post the template containing the listing of omitted tasks.
# Post the template containing the listing of omitted tasks.
&postpage("Template:$omittemplatename", "<includeonly>$omitlisting</includeonly>", "ImplSearchBot:Updating list body of unimplemented tasks.", 1);
&postpage("Template:$omittemplatename", "<includeonly>$omitlisting</includeonly>", "ImplSearchBot:Updating list body of unimplemented tasks.", 1);


# I'll uncomment this if the layout has to change significantly again. In the mean time, that's over a hundred pages we don't have to edit...
# I'll uncomment this if the layout has to change significantly again. In the mean time, that's over a hundred pages we don't have to edit...
# # Update the layout of the listing page, because it's changed.
# # Update the layout of the listing page, because it's changed.
# &postpage($pagename, $pagedata, "ImplSearchBot:Updating layout of listing page.",1);
# &postpage($pagename, $pagedata, "ImplSearchBot:Updating layout of listing page.",1);


my $omitcategoryname = "Category:$language/Omit";
my $omitcategoryname = "Category:$language/Omit";
my $omitcatpagedata = "{{omit_cat|$language}}";
my $omitcatpagedata = "{{omit_cat|$language}}";
&postpage($omitcategoryname, $omitcatpagedata, "ImplSearchBot:Updating Omit category body", 0)
&postpage($omitcategoryname, $omitcatpagedata, "ImplSearchBot:Updating Omit category body", 0)
unless ( exists $createdomitcategories{$omitcategoryname} );
unless ( exists $createdomitcategories{$omitcategoryname} );


}
}
Line 228: Line 173:
my $runtime = time() - $starttime;
my $runtime = time() - $starttime;
my $statsdata = "Pages edited last run: $pagesedited\nTime to post all per-language updates: $runtime seconds\n";
my $statsdata = "Pages edited last run: $pagesedited\nTime to post all per-language updates: $runtime seconds\n";
print "Updating stats page. Runtime ($runtime), Pages edited ($pagesedited)\n";
&out("Updating stats page. Runtime ($runtime), Pages edited ($pagesedited)\n", 2);


&postpage("User:ImplSearchBot/Stats", $statsdata, 0);
&postpage("User:ImplSearchBot/Stats", $statsdata, 0);




print "Updating bot code page\n";
&out("Updating bot code page\n", 4);


open my $sourcefile, '<', $0
open my $sourcefile, '<', $0
or die "Finished without updating bot source page";
or die "Finished without updating bot source page";


my $botsource;
my $botsource;
Line 247: Line 192:
&postpage("User:ImplSearchBot/Code", "<$tag perl>$botsource</$tag>", 0);
&postpage("User:ImplSearchBot/Code", "<$tag perl>$botsource</$tag>", 0);


print "Done\n";
&out("Done\n", 2);

exit(0);

sub out
{
my $string = shift;
my $loglevel = shift;

chomp $string;

if($options{'verbosity'} >= $loglevel)
{
if($loglevel == 1)
{
warn $string; # use stderr.
}
else
{
print "$string\n";
}
}
}

sub sanitizenamefs
{
my $pagename = shift;
$pagename =~ tr/:\//__/;
return $pagename;
}

sub diffcat_simple
{
my $first = shift;
my $second = shift;
my %firsthash = map { $_, 1 } @$first;

my @new = ();

foreach my $secondelement (@$second)
{
push @new, $secondelement
unless exists $firsthash{$secondelement};
}

return @new;
}

sub diffcat
{
my $first = shift;
my $second = shift;
my @newinfirst = &diffcat_simple($second, $first);
my @newinsecond = &diffcat_simple($first, $second);
return (\@newinfirst, \@newinsecond);
}

sub postpage
{
my $pagename = shift;
my $pagedata = shift;
my $remark = shift;
my $minoredit = shift;

++$pagesedited;

unless( exists $options{'post'} )
{
$pagename = &sanitizenamefs($pagename);
$pagename .= ".wikitxt";

&out("Saving: $pagename\n", 3);
open my $outfile, '>', $pagename
or &log("Failed to open $pagename: $!", 1);

return unless defined $outfile;

print $outfile $pagedata;
close $outfile;
}
else
{
&out("Posting $pagename\n", 3);

$editor->edit($pagename, $pagedata, "ImplSearchBot:$remark", $minoredit)
or &out("Failed to post page: " . $editor->{'errstr'}, 1);
}
}

# This allows us to compare site state between now and when we
# last ran, so that we can learn to avoid doing unnecessary work.
# (Saves on server resources and bloated edit statistics...)
# Also, I plan on publishing the cache files in a version control
# system like SVN or Git, to allow multiple bots
# to share the data and to provide history. We're using JSON
# Instead of Perl's native Data::Dumper as JSON has broader
# cross-language support, making it easier for others to use the data.
# SVN has the advantage that I already know how to use it.
# Git has the advantage in that I can use GitHub and not tax
# my Slice with Git traffic, and I don't have to punch a hole
# in the firewall to access more services.
sub cachedata
{
my $dataname = shift;
my $data = shift;
&out("Caching $dataname...", 3);
my $filename = &sanitizenamefs("cache_" . $dataname . ".json");
my $outfile;
unless(open $outfile, '>', $filename)
{
&out("Failed to open $filename: $!\n", 1);
return;
}

print $outfile to_json($data);
close $outfile;
&out(scalar $data . " members cached.\n", 4);
}

sub getcacheddata
{
my $dataname = shift;
&out("Getting cached data for $dataname...", 3);
my $filename = &sanitizenamefs("cache_" . $dataname . ".json");
my $infile;
unless (open $infile, '<', $filename)
{
&out("Failed to load cached data $filename: $!\n", 1);
return [];
}

my $jsondata;
$jsondata .= $_ while <$infile>;
close $infile;

my $cacheddata = from_json($jsondata);
&out(scalar @$cacheddata . " members retrieved\n", 4);

return $cacheddata;
}

sub getcategory
{
my $categoryname = shift;
&out("Getting category contents for $categoryname...", 3);
my @categorycontents = $editor->get_pages_in_category($categoryname);
&out(scalar @categorycontents . " members retrieved\n", 3);

my ($removed, $added) = &diffcat(&getcacheddata($categoryname), \@categorycontents);

&out(scalar @$removed . " removed, " . @$added . " added to $categoryname\n", (scalar @$removed + scalar @$added) > 0 ? 2 : 3);

&cachedata("$categoryname", \@categorycontents);

return @categorycontents;
}

sub getwork
{
my $categoryname = shift;
&out("Getting work for $categoryname.\n",3);
my $olddata = &getcacheddata($categoryname);
my @newdata = &getcategory($categoryname);
my ($removed, $added) = &diffcat($olddata, \@newdata);
my @work = ();
@work = @newdata
if((scalar @$removed + scalar @$added) > 0);
&out(scalar @work . " items to process for $categoryname.\n", 3);

return @work;
}

sub getlangwork
{
my $language = shift;
&out("Getting lang work for $language\n", 3);
my %impl;
foreach my $workitem (&getwork("Category:$language"))
{
$impl{$workitem} = 1;
}
my %omit;
foreach my $workitem (&getwork("Category:$language/Omit"))
{
$omit{$workitem} = 1;
}

# No work to do?
return undef
if( ( 0 == scalar keys %omit) && (0 == scalar keys %impl));

# work to do.
return {'impl' => \%impl,
'omit' => \%omit };
}
</lang>
</lang>

Revision as of 07:10, 12 March 2009

<lang perl>#!/usr/bin/perl -w use strict; use Getopt::Long; use MediaWiki::Bot; use JSON qw/to_json from_json/;


my $usage = "Usage: $0 --username=(username) --password=(password) [--posttosite=yes]";

my %options;

  1. I don't care to pollute my global scope.

{ my $wiki = 'rosettacode.org'; my $username; my $password; my $verbosity = 2; # verbosity level. 0 is silent. 1 is error only. 2 is updates. 3 is process, more is noisy. my $post; # Is this an actual run? my $result = GetOptions( "wiki=s" => \$wiki, "username=s" => \$username, "password=s" => \$password, "verbosity=s" => \$verbosity, "post" => \$post); $options{'wiki'} = $wiki;

die $usage unless defined $username;

$options{'username'} = $username;

die $usage unless defined $password;

$options{'password'} = $password;

$options{'post'} = $post if defined $post;

$options{'verbosity'} = $verbosity; }


  1. Statistic tracking.

my $starttime = time; my $pagesedited = 0;

  1. Handles interaction with the wiki.
  2. Note that I had to modify HTTP::Message to make it work
  3. HTTP::Message silently failed when presented by MW
  4. with an encoding type of "application/json" or some such.

&out("Creating editor\n", 3); my $editor = MediaWiki::Bot->new('ImpleSearchBot'); $editor->{debug} = 1;

  1. Tell the editor to edit Rosetta Code. I'm sure Wikipedia didn't like
  2. my initial attempts from before I added this line.

&out("Trying to set wiki.\n",3); $editor->set_wiki('rosettacode.org','w');

  1. Attempt to log in.

&out("Trying to log in.\n", 3); my $loginres = $editor->login($options{'username'}, $options{'password'});

die "Unable to login: " . $loginres unless $loginres == "Success";

  1. Get a complete listing of the tasks.

&out("Getting tasks\n", 3); my @alltasks = &getcategory('Category:Programming Tasks');

  1. Get a complete listing of the languages.

&out("Getting the languages.\n", 3); my @alllanguages = &getcategory('Category:Programming Languages');

  1. We want the language name, not the fully-qualified wiki name.

$_ =~ s/^Category:// foreach (@alllanguages);

  1. Get a list of the languages for which we've already provided bodies for the related omit categories.
  2. Store it as a hash, so the lookup will be faster.

my %createdomitcategories = map {$_, 1} &getcategory('Category:Maintenance/OmitCategoriesCreated');

&out("Identifying work to do\n", 3);

my %impldiff;

foreach my $lang (@alllanguages) { my $val = &getlangwork($lang); next unless defined $val; $impldiff{$lang} = $val; }

foreach my $language (keys %impldiff) { my $pagename = "Tasks not implemented in $language"; &out("Preparing data for:$pagename\n", 3); my $hashref = $impldiff{$language}->{'impl'}; my %implemented = %$hashref; $hashref = $impldiff{$language}->{'omit'}; my %omitted = %$hashref; my $omitcount = scalar keys %omitted;


# Language metadata my $taskcount = scalar @alltasks; my $unimpcount = $taskcount - scalar keys %implemented; my $targetcount = ($taskcount - $omitcount);

# Language-specific page data. my $unimplisting = ""; my $omitlisting = ""; my $pagedata; # Not assembled until the end.

foreach my $taskname (@alltasks) { # We want the task name, not the fully-qualified wiki name. my $baretaskname = $taskname; $baretaskname =~ s/^Category://;

# Add the task to the unimplemented list, if it's unimplemented. $unimplisting .= "* $baretaskname\n" unless(exists $implemented{$taskname});

# Add the task to the omission list, if it's omitted. $omitlisting .= "* $baretaskname\n" if(exists $omitted{$taskname}) }

# Prepare template fields my $langfield = "|$language"; my $unimpfield = "|$unimpcount"; my $tcfield = "|$targetcount";

my $impperccalc = 0; $impperccalc = (($targetcount - $unimpcount) / $targetcount) * 100 unless ($targetcount == 0);

my $imppercfield = sprintf "|%u", $impperccalc;

my $unimpltemplatename = "unimp_body_$language"; my $omittemplatename = "unimp_omit_body_$language";

# Prepare the listing page format. $pagedata = 'Template:Unimpl header' . $langfield . $unimpfield . $tcfield . $imppercfield . ''; $pagedata .= "Template:$unimpltemplatename" . "$unimpfield"; $pagedata .= "Template:Omit header" . "$langfield"; $pagedata .= "Template:$omittemplatename"; $pagedata .= "Template:Unimpl footer$langfield";

# Post the template containing the listing of unimplemented tasks. &postpage("Template:$unimpltemplatename", "", "ImplSearchBot:Updating list body of unimplemented tasks.", 1);

# Post the template containing the listing of omitted tasks. &postpage("Template:$omittemplatename", "", "ImplSearchBot:Updating list body of unimplemented tasks.", 1);

# I'll uncomment this if the layout has to change significantly again. In the mean time, that's over a hundred pages we don't have to edit... # # Update the layout of the listing page, because it's changed. # &postpage($pagename, $pagedata, "ImplSearchBot:Updating layout of listing page.",1);

my $omitcategoryname = "Category:$language/Omit"; my $omitcatpagedata = "Tasks listed here have been marked as "un-implementable" in $language. Solutions may be impossible to do, too complex to be of any valuable instruction in $language, or prohibited by the task definition.

But hey; if you think you can prove us wrong, go for it. :-) "; &postpage($omitcategoryname, $omitcatpagedata, "ImplSearchBot:Updating Omit category body", 0) unless ( exists $createdomitcategories{$omitcategoryname} );

}

  1. To add here: Post stats on activities:
  2. 1. Pages edited
  3. 2. Time last run started
  4. 3. Time to completion from start to just prior to stat post point.

my $runtime = time() - $starttime; my $statsdata = "Pages edited last run: $pagesedited\nTime to post all per-language updates: $runtime seconds\n"; &out("Updating stats page. Runtime ($runtime), Pages edited ($pagesedited)\n", 2);

&postpage("User:ImplSearchBot/Stats", $statsdata, 0);


&out("Updating bot code page\n", 4);

open my $sourcefile, '<', $0 or die "Finished without updating bot source page";

my $botsource; $botsource .= $_ while <$sourcefile>;

close $sourcefile;

my $tag = "lang";

&postpage("User:ImplSearchBot/Code", "<$tag perl>$botsource</$tag>", 0);

&out("Done\n", 2);

exit(0);

sub out { my $string = shift; my $loglevel = shift;

chomp $string;

if($options{'verbosity'} >= $loglevel) { if($loglevel == 1) { warn $string; # use stderr. } else { print "$string\n"; } } }

sub sanitizenamefs { my $pagename = shift; $pagename =~ tr/:\//__/; return $pagename; }

sub diffcat_simple { my $first = shift; my $second = shift; my %firsthash = map { $_, 1 } @$first;

my @new = ();

foreach my $secondelement (@$second) { push @new, $secondelement unless exists $firsthash{$secondelement}; }

return @new; }

sub diffcat { my $first = shift; my $second = shift; my @newinfirst = &diffcat_simple($second, $first); my @newinsecond = &diffcat_simple($first, $second); return (\@newinfirst, \@newinsecond); }

sub postpage { my $pagename = shift; my $pagedata = shift; my $remark = shift; my $minoredit = shift;

++$pagesedited;

unless( exists $options{'post'} ) { $pagename = &sanitizenamefs($pagename); $pagename .= ".wikitxt";

&out("Saving: $pagename\n", 3);

open my $outfile, '>', $pagename or &log("Failed to open $pagename: $!", 1);

return unless defined $outfile;

print $outfile $pagedata; close $outfile; } else { &out("Posting $pagename\n", 3);

$editor->edit($pagename, $pagedata, "ImplSearchBot:$remark", $minoredit) or &out("Failed to post page: " . $editor->{'errstr'}, 1); } }

  1. This allows us to compare site state between now and when we
  2. last ran, so that we can learn to avoid doing unnecessary work.
  3. (Saves on server resources and bloated edit statistics...)
  4. Also, I plan on publishing the cache files in a version control
  5. system like SVN or Git, to allow multiple bots
  6. to share the data and to provide history. We're using JSON
  7. Instead of Perl's native Data::Dumper as JSON has broader
  8. cross-language support, making it easier for others to use the data.
  9. SVN has the advantage that I already know how to use it.
  10. Git has the advantage in that I can use GitHub and not tax
  11. my Slice with Git traffic, and I don't have to punch a hole
  12. in the firewall to access more services.

sub cachedata { my $dataname = shift; my $data = shift; &out("Caching $dataname...", 3); my $filename = &sanitizenamefs("cache_" . $dataname . ".json"); my $outfile; unless(open $outfile, '>', $filename) { &out("Failed to open $filename: $!\n", 1); return; }

print $outfile to_json($data); close $outfile; &out(scalar $data . " members cached.\n", 4); }

sub getcacheddata { my $dataname = shift; &out("Getting cached data for $dataname...", 3); my $filename = &sanitizenamefs("cache_" . $dataname . ".json"); my $infile; unless (open $infile, '<', $filename) { &out("Failed to load cached data $filename: $!\n", 1); return []; }

my $jsondata; $jsondata .= $_ while <$infile>; close $infile;

my $cacheddata = from_json($jsondata); &out(scalar @$cacheddata . " members retrieved\n", 4);

return $cacheddata; }

sub getcategory { my $categoryname = shift; &out("Getting category contents for $categoryname...", 3); my @categorycontents = $editor->get_pages_in_category($categoryname); &out(scalar @categorycontents . " members retrieved\n", 3);

my ($removed, $added) = &diffcat(&getcacheddata($categoryname), \@categorycontents);

&out(scalar @$removed . " removed, " . @$added . " added to $categoryname\n", (scalar @$removed + scalar @$added) > 0 ? 2 : 3);

&cachedata("$categoryname", \@categorycontents);

return @categorycontents; }

sub getwork { my $categoryname = shift; &out("Getting work for $categoryname.\n",3); my $olddata = &getcacheddata($categoryname); my @newdata = &getcategory($categoryname); my ($removed, $added) = &diffcat($olddata, \@newdata); my @work = (); @work = @newdata if((scalar @$removed + scalar @$added) > 0);

&out(scalar @work . " items to process for $categoryname.\n", 3);

return @work; }

sub getlangwork { my $language = shift; &out("Getting lang work for $language\n", 3); my %impl; foreach my $workitem (&getwork("Category:$language")) { $impl{$workitem} = 1; } my %omit; foreach my $workitem (&getwork("Category:$language/Omit")) { $omit{$workitem} = 1; }

# No work to do? return undef if( ( 0 == scalar keys %omit) && (0 == scalar keys %impl));

# work to do. return {'impl' => \%impl, 'omit' => \%omit }; } </lang>