Rosetta Code:Village Pump/Lang-tag bot/Source

From Rosetta Code
Revision as of 13:40, 16 November 2009 by Underscore (talk | contribs) (Changed to persistently keep track of which pages have been edited and to tag APL. Added to Category:Perl.)

<lang perl>use warnings; use strict; use feature 'say'; use List::Util '&minstr'; use HTML::Entities '&decode_entities'; use YAML::XS qw(&DumpFile &LoadFile); use MediaWiki::API;

use constant DELAY_BETWEEN_EDITS => 10*60; # In seconds.

my $username = 'UnderBot'; my $password = 'secret';

my $kill_switch_page = 'User talk:UnderBot'; my $kill_switch_trigger = qr/stopediting/;

my $tasks_path = '/home/hippo/Temporary/tasks.yml';

my %langtags = ( '4d' => '4d', 'actionscript' => 'actionscript', 'ada' => 'ada', 'agda2' => 'agda2', 'algol 60' => 'algol60', 'algol 68' => 'algol68', 'amigae' => 'amigae', 'apl' => 'apl', 'applescript' => 'applescript', 'assembly' => 'asm', 'autohotkey' => 'autohotkey', 'awk' => 'awk', 'bc' => 'bc', 'befunge' => 'befunge', 'brainf***' => 'bf', 'caml' => 'caml', 'c' => 'c', 'c++' => 'cpp', 'clean' => 'clean', 'clojure' => 'lisp', 'cobol' => 'cobol', 'coldfusion' => 'cfm', 'common lisp' => 'lisp', 'component pascal' => 'pascal', 'coq' => 'coq', 'c sharp|c#' => 'csharp', 'c sharp' => 'csharp', 'dc' => 'dc', 'd' => 'd', 'delphi' => 'delphi', 'dos batch file' => 'dos', 'ec' => 'ec', 'e' => 'e', 'eiffel' => 'eiffel', 'ella' => 'ella', 'emacs lisp' => 'lisp', 'erlang' => 'erlang', 'esql' => 'sql', 'factor' => 'factor', 'false' => 'false', 'fan' => 'fan', 'f' => 'f', 'forth' => 'forth', 'fortran' => 'fortran', 'fp' => 'fp', 'f sharp|f#' => 'fsharp', 'f_sharp|f#' => 'fsharp', 'gap' => 'gap', 'genyris' => 'genyris', 'gnuplot' => 'gnuplot', 'go' => 'go', 'groovy' => 'groovy', 'haskell' => 'haskell', 'haxe' => 'haxe', 'hq9+' => 'hq9p', 'html' => 'html4strict', 'icon' => 'icon', 'idl' => 'idl', 'io' => 'io', 'javafx script' => 'javafx', 'java' => 'java', 'javascript' => 'javascript', 'j' => 'j', 'jocaml' => 'jocaml', 'joy' => 'joy', 'jscript.net' => 'jscript.net', 'json' => 'json', 'judoscript' => 'judoscript', 'korn shell' => 'korn', 'labview' => 'labview', 'latex' => 'latex', 'lisaac' => 'lisaac', 'lisp' => 'lisp', 'logo' => 'logo', 'logtalk' => 'logtalk', 'lotusscript' => 'lotusscript', 'lse64' => 'lse64', 'lua' => 'lua', 'lucid' => 'lucid', 'm4' => 'm4', 'make' => 'make', 'maple' => 'maple', 'mathematica' => 'mathematica', 'matlab' => 'matlab', 'maxima' => 'maxima', 'maxscript' => 'maxscript', 'metafont' => 'metafont', 'mirc scripting language' => 'mirc', 'mmix' => 'mmix', 'modula-2' => 'modula2', 'modula-3' => 'modula3', 'moo' => 'moo', 'mpif90' => 'mpif90', 'ms sql' => 'sql', 'mysql' => 'sql', 'newlisp' => 'lisp', 'nial' => 'nial', 'oberon-2' => 'oberon2', 'objective-c' => 'objc', 'object pascal' => 'objectpascal', 'ocaml' => 'ocaml', 'octave' => 'octave', 'omega' => 'omega', 'openedge/progress' => 'openedge', 'oz' => 'oz', 'pari/gp' => 'parigp', 'pascal' => 'pascal', 'perl 6' => 'perl6', 'perl' => 'perl', 'php' => 'php', 'pike' => 'pike', 'plaintex' => 'tex', 'pl/i' => 'pli', 'pl/pgsql' => 'plpgsql', 'pl/sql' => 'plsql', 'pop11' => 'pop11', 'postgresql' => 'sql', 'postscript' => 'postscript', 'powerbasic' => 'powerbasic', 'powershell' => 'powershell', 'prolog' => 'prolog', 'pure' => 'pure', 'python' => 'python', 'q' => 'q', 'rapidq' => 'rapidq', 'raven' => 'raven', 'rexx' => 'rexx', 'rhope' => 'rhope', 'r' => 'r', 'ruby' => 'ruby', 'sas' => 'sas', 'scala' => 'scala', 'scheme' => 'scheme', 'script3d' => 'script3d', 'seed7' => 'seed7', 'self' => 'self', 'setl' => 'setl', 'slate' => 'slate', 'smalltalk' => 'smalltalk', 'smeql' => 'smeql', 'snusp' => 'snusp', 'sql' => 'sql', 'standard ml' => 'sml', 'supercollider' => 'supercollider', 'svg' => 'xml', 'tcl' => 'tcl', 'ti-83 basic' => 'ti83b', 'ti-89 basic' => 'ti89b', 'toka' => 'toka', 'transact-sql' => 'sql', 'tr' => 'tr', 'twelf' => 'twelf', 'unixpipes' => 'bash', 'unix shell' => 'bash', 'unlambda' => 'unlambda', 'ursala' => 'ursala', 'vbscript' => 'vbscript', 'vedit macro language' => 'vedit', 'visual basic .net' => 'vbnet', 'visual basic' => 'vb', 'visual objects' => 'visobj', 'vorpal' => 'vorpal', 'v' => 'v', 'wrapl' => 'wrapl', 'xquery' => 'xquery', 'xslt' => 'xml', 'xtalk' => 'xtalk', );

my $h = qr/(?:\t| )*/;

# Vaguely like Perl 6's \h.

my $lwsl = qr/(?:\t| )+\S[^\n]*/;

# Leading WhiteSpace Line.
  1. ------------------------------------------------------------

our (%tasks, @done, @todo); local *tasks = LoadFile $tasks_path; local *done = $tasks{done}; local *todo = $tasks{todo};

my $mw = new MediaWiki::API({api_url => 'http://rosettacode.org/mw/api.php'}); $mw->login({lgname => $username, lgpassword => $password})

  or die q(Couldn't log in.);

while (@todo)

  {my $pagetitle = shift @todo;
   say "TITLE: $pagetitle";
   # Check the kill switch.
   my $p = $mw->get_page({title => $kill_switch_page}) || die;
   $p->{'*'} =~ $kill_switch_trigger and die "Killed.\n";
   $p = $mw->get_page({title => $pagetitle}) || die;
   my $timestamp = $p->{timestamp}; # To prevent edit conflicts.
   my $text = $p->{'*'};
   $text =~ s/ (.+? \n) (== \s* {{) /$2/xs or die;
   my $newtext = $1;
     # So $newtext just contains the task description so far.
   while ($text =~ s! \A
            ( == $h Template:$h header $h \ $h == $h \n )
            (.+?)
            ( \z | == $h {{ )
             !$4!xs)
      {my ($header, $langname, $body) = ($1, $2, $3);
       s/\bC #/C#/ foreach $header, $langname;
         # Why some people put a space there, I have no idea.
       my $tag = $langtags{lc $langname} ||
           $langname =~ /assembl/i && 'asm' ||
           ## BASIC dialect-guessing is commented out because
           ## on some pages, programs for more than one dialect
           ## appear under "BASIC". Really we ought to treat
           ## each dialect as its own language.
           #$langname =~ /basic/i &&
           #   ($body =~ /q(uick)?basic/i && 'qbasic' ||
           #    $body =~ /f(ree)?basic/i && 'freebasic' ||
           #    $body =~ /t(hin)?basic/i && 'thinbasic') ||
           undef;
       if ($tag)
          {$tag = "<lang $tag>";
           if ($body =~ /<lang/)
              {# Use the correct identifier.
               $body =~ s {$h (<lang [^>]* >)}
                  {my $s = $1;
                   lc($s) eq $tag || $s =~ /java5/i
                   ? $s # Don't replace "Mathematica" with "mathematica" or "java5" with "java"
                   : $tag}gxe;
               # Get rid of any indenting spaces left behind when
               # someone else added the lang tags.
               lc($langname) eq 'whitespace' or $body =~ s
                   {(<lang [^>]* >) ((?:$h\n)*) (.+?) \s* \x3c/lang>}
                   {my ($t, $leading, $b) = ($1, $2, $3);
                    if ($b !~ /^\S/m and ($leading or $b =~ /\n/))
                      # It there's no newline in $b, the
                      # indentation is probably intentional (as
                      # in many J examples).
                       {my $space = minstr($b =~ /^( +)/gm);
                        $b =~ s/^$space//gm;}
                     "$t$b\x3c/lang>"}xges;}

elsif ($body =~ /

"}xseg;}
                      # HTML entities don't work in lang tags.
                      # But they aren't necessary, either.
            else
              # Turn indented passages into lang-tagged passages.
               {$body =~ s
                   { (  ^ $lwsl \n
                      (?: (?: $lwsl \n | $h \n )*
                           $lwsl \n )? ) }
                   {my $t = $1;
                    my $space = minstr($t =~ /^( +)/gm);
                    $t =~ s/^$space//gm;
                    $t =~ s/\s+\z//;
                    decode_entities("$tag$t\x3c/lang>\n");}mgex;}}

        $body =~ s
            {(<lang [^>]*>)  \s* (.+?) \s*  \x3c/lang>}
            {$1$2\x3c/lang>}gsx;
        $newtext .= $header . $body;}

     $newtext .= $text;
     $newtext =~ s/\s*\z/\n/;
     
     my $success = $mw->edit
       ({action => 'edit',
             title => $pagetitle,
             basetimestamp => $timestamp,
             text => $newtext,
             minor => 1,
               # All we're doing, ultimately, is formatting.
             bot => 1,
             nocreate => 1,
               # If the page was deleted while we were regexing,
               # we probably shouldn't resurrect it!
             summary => 'Fixed lang tags (automatic edit).'},
        {skip_encoding => 1});
           # Without the skip_encoding option, non-ASCII characters
           # will get corrupted.

     if ($success)
        {say 'Committed!';
         push @done, $pagetitle;}
     else
       # Probably an edit conflict.
        {say "Couldn't commit; I'll try again later.";
         push @todo, $pagetitle;}
     DumpFile $tasks_path, \%tasks;

     sleep DELAY_BETWEEN_EDITS;}</lang>