Rosetta Code/List authors of task descriptions: Difference between revisions
Thundergnat (talk | contribs) m (update full list) |
Thundergnat (talk | contribs) (→{{header|Perl 6}}: Refactor to make more modular/resumable. Adjust table spacing, now shows date task added and author links) |
||
Line 272: | Line 272: | ||
{{works with|Rakudo|2017.08}} |
{{works with|Rakudo|2017.08}} |
||
The pseudocode above is no longer really useful as the page format has changed. Rather than checking '''every''' edit to see if it was a change to the task description, we'll just assume the user that created the page is the task author. This isn't 100% accurate; a very few pages got renamed and recreated by someone other than the original author without preserving the history, so they are misreported (15 Puzzle Game for instance,) but is as good as it is likely to get without extensive manual intervention. |
The pseudocode above is no longer really useful as the page format has changed significantly since this task was written. Rather than checking '''every''' edit to see if it was a change to the task description, we'll just assume the user that created the page is the task author. This isn't 100% accurate; a very few pages got renamed and recreated by someone other than the original author without preserving the history, so they are misreported (15 Puzzle Game, for instance), but it is as good as it is likely to get without extensive manual intervention. Subsequent edits to the task description are not credited. As it is, we must still make ''thousands'' of requests and pound the server pretty hard. Checking '''every''' edit would make the task several orders of magnitude more abusive of the server (and my internet connection). |
||
The task names and author information are saved to local files so it can pick up where it left off if it gets interrupted during processing. As the task creation time (and original editor) never change, don't bother to re-download every time. Just update the category (Draft or Task) as that is the only thing that really changes. If a task name gets edited, manual intervention is required. Either edit the JSON file with the task information or just delete it and recreate it from scratch. |
|||
Each stage of the scraping process is saved to local files so it can be restarted without losing all your progress in the event of a timeout or error. If that happens though, you need to manually adjust where to restart the process. |
|||
<lang perl6>use HTTP::UserAgent; |
<lang perl6>use HTTP::UserAgent; |
||
use Gumbo; |
use Gumbo; |
||
use Sort::Naturally; |
use Sort::Naturally; |
||
use JSON::Fast; |
|||
my $ua = HTTP::UserAgent.new; |
my $ua = HTTP::UserAgent.new; |
||
Line 284: | Line 285: | ||
for 'Programming_Tasks', 'Draft_Programming_Tasks' -> $category |
for 'Programming_Tasks', 'Draft_Programming_Tasks' -> $category |
||
{ # Get lists of Tasks & Draft Tasks |
{ # Get lists of Tasks & Draft Tasks |
||
# |
#last; # Uncomment to skip this step |
||
say "Updating $category list..."; |
|||
my $page = "http://rosettacode.org/wiki/Category:$category"; |
|||
my $html |
my $html = $ua.get($page).content; |
||
my $xmldoc = parse-html($html, :TAG<div>, :id<mw-pages>); |
my $xmldoc = parse-html($html, :TAG<div>, :id<mw-pages>); |
||
my @tasks = parse-html($xmldoc[0].Str, :TAG<li>).Str.comb( /'/wiki/' <-["]>+ / ) |
my @tasks = parse-html($xmldoc[0].Str, :TAG<li>).Str.comb( /'/wiki/' <-["]>+ / )».substr(6); #'" |
||
my $f = open("./RC_{$category}.txt", :w) or die "$!\n"; |
my $f = open("./RC_{$category}.txt", :w) or die "$!\n"; |
||
note "Writing $category file..."; |
|||
$f.print( @tasks.join("\n") ); |
$f.print( @tasks.join("\n") ); |
||
$f.close; |
$f.close; |
||
} |
} |
||
my %cat = ( # Friendlier descriptions for task categories |
|||
⚫ | |||
'Draft_Programming_Tasks' => 'Draft:' |
|||
⚫ | |||
# Month names for date manipulations, mapped to calendar month numbers |
# (January => 1 .. December => 12). toISO8601 prints the mapped value |
# directly as the MM field, so the numbering must start at 1, not 0. |
|||
my %months = <January February March April May June July August |
|||
September October November December> Z=> 1..12; |
|||
my $hashfile = './RC_hash.json'; |
|||
my $htmlfile = './RC_Authors.html'; |
|||
note "Reading JSON hash file..."; |
|||
my %tasks = $hashfile.IO.e ?? $hashfile.IO.slurp.&from-json !! ( ); |
|||
for 'Programming_Tasks', 'Draft_Programming_Tasks' -> $category |
for 'Programming_Tasks', 'Draft_Programming_Tasks' -> $category |
||
{ # Scrape info from each page. |
{ # Scrape info from each page. |
||
# |
#last; # Uncomment to skip this step |
||
note "Loading $category file..."; |
|||
my @entries = "./RC_{$category}.txt".IO.slurp.lines; |
|||
for @ |
for @entries -> $title { |
||
# Update the category as that is the only thing that can really change. |
|||
%tasks{$title}{'category'} = %cat{$category}; |
|||
# Otherwise skip if it has already been indexed. The creation date can't change |
|||
# the task name *can* change, but it is exceedingly rare |
|||
if %tasks{$title}{'title'}:exists { |
|||
⚫ | |||
⚫ | |||
⚫ | |||
my $ua = HTTP::UserAgent.new; |
|||
# Get the earliest edit |
# Get the earliest edit |
||
my $ |
my $html = $ua.get: "http://rosettacode.org/mw/index.php?title={$title}&dir=prev&limit=1&action=history"; |
||
# Filter out the actual history links |
|||
⚫ | |||
⚫ | |||
# Only interested in the oldest (last in the list) |
|||
my $line = $0.lines.tail; |
my $line = $0.lines.tail; |
||
# Parse out the User name |
# Parse out the User name |
||
$line ~~ m| 'title="User:' <-[>]>+? '>' (.+?) '</a>' |; |
$line ~~ m| 'title="User:' <-[>]>+? '>' (.+?) '</a>' |; |
||
⚫ | |||
⚫ | |||
# Oops, no user name, must be anonymous, get IP address instead |
# Oops, no user name, must be anonymous, get IP address instead |
||
unless $auth { |
unless $auth { |
||
$line ~~ m| '"mw-userlink mw-anonuserlink">' (.+?) '</a>' |; |
$line ~~ m| '"mw-userlink mw-anonuserlink">' (.+?) '</a>' |; |
||
$auth = $0; |
$auth = $0.Str; |
||
} |
} |
||
%tasks{$title}{'author'} = $auth; |
|||
# Parse out human readable title |
# Parse out human readable title |
||
$line ~~ m| '<a href="/mw/index.php?title=' $title '&' .+? 'title="'(<-["]>+)'"' |; #"' |
$line ~~ m| '<a href="/mw/index.php?title=' $title '&' .+? 'title="'(<-["]>+)'"' |; #"' |
||
%tasks{$title}{'title'} = $0.Str; |
|||
# Parse out date task was added, convert date to ISO format |
|||
⚫ | |||
$line ~~ m| 'class="mw-changeslist-date">' <-[\s]>+ (<-[<]>+) '</a>' |; |
|||
%tasks{$title}{'date'} = $0.Str.trim.&toISO8601; |
|||
# report progress |
# report progress |
||
note $title; |
|||
# save |
# save to a file |
||
$hashfile.IO.spurt(%tasks.&to-json); |
|||
my $f = open("./RC_Authors.txt", :a) or die "$!\n"; |
|||
$f.say( "[[$title|$decoded]]\t$category\t$auth" ); |
|||
⚫ | |||
sleep 3; # Don't pound the server |
sleep 3; # Don't pound the server |
||
Line 337: | Line 364: | ||
} |
} |
||
# |
# Convert saved task / author info to an HTML table |
||
note "Building HTML table..."; |
|||
my %authors; |
|||
my |
my $count = +%tasks; |
||
my $taskcnt = +%tasks.grep: *.value.<category> eq %cat<Programming_Tasks>; |
|||
"./RC_Authors.txt".IO.slurp.lines.map: { |
|||
my $draftcnt = $count - $taskcnt; |
|||
⚫ | |||
if $cat.contains('Draft') { |
|||
$cat = 'Draft:'; |
|||
$draftcnt++; |
|||
} else { |
|||
⚫ | |||
$taskcnt++; |
|||
⚫ | |||
%authors{$auth}.push: "$cat $task"; |
|||
⚫ | |||
# Dump an HTML table to a file |
# Dump an HTML table to a file |
||
my $out = open( |
my $out = open($htmlfile, :w) or die "$!\n"; |
||
# Add table boilerplate and header |
|||
$out.say( '<table border="1" cellpadding="4"><tr><th colspan="2">As of ', Date.today, ' | Total: ', |
$out.say( '<table border="1" cellpadding="4"><tr><th colspan="2">As of ', Date.today, ' | Total: ', |
||
"$count / Tasks: $taskcnt / Draft Tasks: $draftcnt", |
|||
'<tr><th>User</th><th>Authored</th></tr>' ); |
|||
⚫ | |||
# Get sorted unique list of task authors |
|||
⚫ | |||
⚫ | |||
$out.print( $a.value.sort( *.substr(7) ).join('</li><li>') ); |
|||
$out. |
$out.print( '<tr><td><ul>[[User:', $author, '|', $author, ']]</ul></td><td><ul><ol>' ); |
||
# Get list of tasks by this author, sorted by name |
|||
for %tasks.grep( { $_.value.<author> eq $author } ).sort(*.key.&naturally) -> $task { |
|||
# and add them |
|||
$out.print( "<li>{$task.value.<date>} - {$task.value.<category>}", |
|||
" [[{$task.key}|{$task.value.<title>}]]</li>" |
|||
⚫ | |||
⚫ | |||
⚫ | |||
} |
} |
||
$out.say( '</table>' ); |
$out.say( '</table>' ); |
||
$out.close; |
$out.close; |
||
say "HTML file saved as: {$htmlfile.IO.absolute}"; |
|||
# Convert a space-separated "day month-name year" string (for example |
# "16 April 2013") into a zero-padded "YYYY-MM-DD" string. |
# The month field is whatever number the file-level %months hash holds for |
# the month name; the day and year are coerced with .Int. |
# The sprintf result is the last statement and therefore the return value. |
sub toISO8601 ($date) { # convert day month year to YYYY-MM-DD |
|||
# @dmy[0] is the day, @dmy[1] the month name, @dmy[2] the year |
my @dmy = $date.split: ' '; |
|||
sprintf "%4d-%02d-%02d", @dmy[2].Int, %months{@dmy[1]}, @dmy[0].Int; |
|||
} |
|||
</lang> |
|||
;Sample output |
;Sample output |
||
<table border="1"><tr><th colspan="2">As of 2017- |
<table border="1" cellpadding="4"><tr><th colspan="2">As of 2017-10-10 | Total: 1067 / Tasks: 859 / Draft Tasks: 208<tr><th>User</th><th>Authored</th></tr> |
||
<tr><td>2Powers</td><td><ol><li>Draft: [[Names_to_numbers|Names to numbers]]</li><li>Draft: [[Solving_coin_problems|Solving coin problems]]</ol></td></tr> |
<tr><td><ul>[[User:2Powers|2Powers]]</ul></td><td><ul><ol><li>2013-04-16 - Draft: [[Names_to_numbers|Names to numbers]]</li><li>2013-04-16 - Draft: [[Solving_coin_problems|Solving coin problems]]</li></ol></ul></td></tr> |
||
<tr><td>12.175.32.19</td><td><ol><li>Task: [[Soundex|Soundex]]</ol></td></tr> |
<tr><td><ul>[[User:12.175.32.19|12.175.32.19]]</ul></td><td><ul><ol><li>2009-10-12 - Task: [[Soundex|Soundex]]</li></ol></ul></td></tr> |
||
<tr><td>12Me21</td><td><ol><li>Task: [[Draw_a_rotating_cube|Draw a rotating cube]]</ol></td></tr> |
<tr><td><ul>[[User:12Me21|12Me21]]</ul></td><td><ul><ol><li>2015-04-04 - Task: [[Draw_a_rotating_cube|Draw a rotating cube]]</li></ol></ul></td></tr> |
||
<tr><td colspan='2'><br/> Many rows omitted... <br/></td></tr> |
<tr><td colspan='2'><br/> Many rows omitted... <br/></td></tr> |
||
<tr><td>Zorro1024</td><td><ol><li>Task: [[Perfect_shuffle|Perfect shuffle]]</li><li>Draft: [[Vector|Vector]]</ol></td></tr> |
<tr><td><ul>[[User:Zorro1024|Zorro1024]]</ul></td><td><ul><ol><li>2015-03-16 - Task: [[Perfect_shuffle|Perfect shuffle]]</li><li>2015-02-21 - Draft: [[Vector|Vector]]</li></ol></ul></td></tr> |
||
<tr><td>Zzo38</td><td><ol><li>Task: [[Thue-Morse|Thue-Morse]]</ol></td></tr> |
<tr><td><ul>[[User:Zzo38|Zzo38]]</ul></td><td><ul><ol><li>2015-08-20 - Task: [[Thue-Morse|Thue-Morse]]</li></ol></ul></td></tr> |
||
<tr><td>Русский</td><td><ol><li>Task: [[Main_step_of_GOST_28147-89|Main step of GOST 28147-89]]</li><li>Draft: [[Old_Russian_measure_of_length|Old Russian measure of length]]</li><li>Draft: [[Transportation_problem|Transportation problem]]</ol></td></tr> |
<tr><td><ul>[[User:Русский|Русский]]</ul></td><td><ul><ol><li>2012-07-31 - Task: [[Main_step_of_GOST_28147-89|Main step of GOST 28147-89]]</li><li>2013-00-09 - Draft: [[Old_Russian_measure_of_length|Old Russian measure of length]]</li><li>2013-04-24 - Draft: [[Transportation_problem|Transportation problem]]</li></ol></ul></td></tr> |
||
</table> |
</table> |
Revision as of 23:57, 10 October 2017
In this task, the goal is to compile an authorship list for task descriptions. A pseudocode example (in imperative style) that should accomplish this is as follows:
<lang pseudocode>for each task page
grab page source, discard everything after the first ==section==.
Cache as $previous. Note $author.
for each revision grab page source, discard everything after first ==section==.
Cache as $previous2. Note $author2
compare $previous2 to $previous. If different, record $author to $list. replace $previous with $previous2 replace $author with $author2</lang>
The following resources for HTTP interface information for MediaWiki may prove to be useful:
- https://www.mediawiki.org/wiki/Index.php#Raw
- https://www.mediawiki.org/wiki/Index.php#History
- https://www.mediawiki.org/wiki/API:Main_page
Conversely, some languages have libraries which abstract these interfaces into language-native idioms. Use of these abstractions is perfectly fine.
Please DO NOT add a full output for each programming language; just show a representative sample. One full list is useful. Multiple full lists just use space and bandwidth.
Perl 6
The pseudocode above is no longer really useful as the page format has changed significantly since this task was written. Rather than checking every edit to see if it was a change to the task description, we'll just assume the user that created the page is the task author. This isn't 100% accurate; a very few pages got renamed and recreated by someone other than the original author without preserving the history, so they are misreported (15 Puzzle Game, for instance), but it is as good as it is likely to get without extensive manual intervention. Subsequent edits to the task description are not credited. As it is, we must still make thousands of requests and pound the server pretty hard. Checking every edit would make the task several orders of magnitude more abusive of the server (and my internet connection).
The task names and author information are saved to local files so it can pick up where it left off if it gets interrupted during processing. As the task creation time (and original editor) never change, don't bother to re-download every time. Just update the category (Draft or Task) as that is the only thing that really changes. If a task name gets edited, manual intervention is required. Either edit the JSON file with the task information or just delete it and recreate it from scratch.
<lang perl6>use HTTP::UserAgent; use Gumbo; use Sort::Naturally; use JSON::Fast;
my $ua = HTTP::UserAgent.new;
for 'Programming_Tasks', 'Draft_Programming_Tasks' -> $category { # Get lists of Tasks & Draft Tasks
#last; # Uncomment to skip this step say "Updating $category list..."; my $page = "http://rosettacode.org/wiki/Category:$category"; my $html = $ua.get($page).content;
my $xmldoc = parse-html($html, :TAG
- Month names for date manipulations
# Only interested in the oldest (last in the list) my $line = $0.lines.tail;
# Parse out the User name $line ~~ m| 'title="User:' <-[>]>+? '>' (.+?) '</a>' |; my $auth = $0 ?? $0.Str !! ; # Oops, no user name, must be anonymous, get IP address instead unless $auth { $line ~~ m| '"mw-userlink mw-anonuserlink">' (.+?) '</a>' |; $auth = $0.Str; } %tasks{$title}{'author'} = $auth;
# Parse out human readable title $line ~~ m| '<a href="/mw/index.php?title=' $title '&' .+? 'title="'(<-["]>+)'"' |; #"' %tasks{$title}{'title'} = $0.Str;
# Parse out date task was added, convert date to ISO format $line ~~ m| 'class="mw-changeslist-date">' <-[\s]>+ (<-[<]>+) '</a>' |; %tasks{$title}{'date'} = $0.Str.trim.&toISO8601;
# report progress note $title;
# save to a file $hashfile.IO.spurt(%tasks.&to-json);
sleep 3; # Don't pound the server }
}
- Convert saved task / author info to an HTML table
note "Building HTML table..."; my $count = +%tasks; my $taskcnt = +%tasks.grep: *.value.<category> eq %cat<Programming_Tasks>; my $draftcnt = $count - $taskcnt;
- Dump an HTML table to a file
my $out = open($htmlfile, :w) or die "$!\n";
- Add table boilerplate and header
- Get sorted unique list of task authors
for %tasks{*}».<author>.unique.sort(*.&naturally) -> $author {
$out.print( '' );}
$out.say( 'As of ', Date.today, ' | Total: ',
"$count / Tasks: $taskcnt / Draft Tasks: $draftcnt",' | |
---|---|
User | Authored |
# Get list of tasks by this author, sorted by name for %tasks.grep( { $_.value.<author> eq $author } ).sort(*.key.&naturally) -> $task { # and add them$out.print( " |
$out.close;
say "HTML file saved as: {$htmlfile.IO.absolute}";
# Turn a space-separated "day month-name year" string into "YYYY-MM-DD".
sub toISO8601 ($date) {
    # Unpack the three space-separated fields by position.
    my ($day, $month, $year) = $date.split: ' ';
    # The month number comes from the file-level %months lookup; the
    # formatted string is the sub's return value.
    sprintf "%4d-%02d-%02d", $year.Int, %months{$month}, $day.Int;
} </lang>
- Sample output
As of 2017-10-10 | Total: 1067 / Tasks: 859 / Draft Tasks: 208 | |
---|---|
User | Authored |
| |
| |
| |
Many rows omitted... | |
| |
| |
|