I'm working on modernizing Rosetta Code's infrastructure. Starting with communications. Please accept this time-limited open invite to RC's Slack.. --Michael Mol (talk) 20:59, 30 May 2020 (UTC)

Rosetta Code/List authors of task descriptions

From Rosetta Code
Rosetta Code/List authors of task descriptions is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.
In this task, the goal is to compile an authorship list for task descriptions. A pseudocode example (in imperative style) that should accomplish this is as follows:
for each task page
grab page source, discard everything after the first ==section==.
Cache as $previous. Note $author.
for each revision
grab page source, discard everything after first ==section==.
Cache as $previous2. Note $author2
compare $previous2 to $previous. If different, record $author to $list.
replace $previous with $previous2
replace $author with $author2

The following resources for HTTP interface information for MediaWiki may prove to be useful:

Conversely, some languages have libraries which abstract these interfaces into language-native idioms. Use of these abstractions is perfectly fine.


Please DO NOT add a full output for each programming language; just show a representative sample. For an full listing, see Rosetta_Code/List_authors_of_task_descriptions/Full_list.

Go[edit]

package main
 
import (
"fmt"
"io/ioutil"
"net/http"
"regexp"
"sort"
"strings"
)
 
type authorNumber struct {
author string
number int
}
 
func main() {
ex1 := `<li><a href="/wiki/(.*?)"`
ex2 := `a href="/(wiki/User:|mw/index\.php\?title=User:|wiki/Special:Contributions/)([^"&]+)`
re1 := regexp.MustCompile(ex1)
re2 := regexp.MustCompile(ex2)
url1 := "http://rosettacode.org/wiki/Category:Programming_Tasks"
url2 := "http://rosettacode.org/wiki/Category:Draft_Programming_Tasks"
urls := []string{url1, url2}
var tasks []string
for _, url := range urls {
resp, _ := http.Get(url)
body, _ := ioutil.ReadAll(resp.Body)
// find all tasks
matches := re1.FindAllStringSubmatch(string(body), -1)
resp.Body.Close()
for _, match := range matches {
// exclude any 'category' references
if !strings.HasPrefix(match[1], "Category:") {
tasks = append(tasks, match[1])
}
}
}
authors := make(map[string]int)
for _, task := range tasks {
// check the last or only history page for each task
page := fmt.Sprintf("http://rosettacode.org/mw/index.php?title=%s&dir=prev&action=history", task)
resp, _ := http.Get(page)
body, _ := ioutil.ReadAll(resp.Body)
// find all the users in that page
matches := re2.FindAllStringSubmatch(string(body), -1)
resp.Body.Close()
// the task author should be the final user on that page
author := matches[len(matches)-1][2]
author = strings.ReplaceAll(author, "_", " ")
// add this task to the author's count
authors[author]++
}
// sort the authors in descending order by number of tasks created
authorNumbers := make([]authorNumber, 0, len(authors))
for k, v := range authors {
authorNumbers = append(authorNumbers, authorNumber{k, v})
}
sort.Slice(authorNumbers, func(i, j int) bool {
return authorNumbers[i].number > authorNumbers[j].number
})
// print the top twenty say
fmt.Println("Total tasks  :", len(tasks))
fmt.Println("Total authors :", len(authors))
fmt.Println("\nThe top 20 authors by number of tasks created are:\n")
fmt.Println("Pos Tasks Author")
fmt.Println("=== ===== ======")
lastNumber, lastIndex := 0, -1
for i, authorNumber := range authorNumbers[0:20] {
j := i
if authorNumber.number == lastNumber {
j = lastIndex
} else {
lastIndex = i
lastNumber = authorNumber.number
}
fmt.Printf("%2d:  %3d  %s\n", j+1, authorNumber.number, authorNumber.author)
}
}
Output:

As of 5th March 2020:

Total tasks   : 1237
Total authors : 287

The top 20 authors by number of tasks created are:

Pos  Tasks  Author
===  =====  ======
 1:   178   Paddy3118
 2:    71   Markhobley
 3:    61   Gerard Schildberger
 4:    55   Mwn3d
 5:    39   NevilleDNZ
 6:    33   Short Circuit
 7:    30   Nigel Galloway
 8:    29   Thundergnat
 9:    23   Grondilu
10:    21   Dkf
11:    20   Fwend
11:    20   Blue Prawn
13:    19   CalmoSoft
14:    18   Kernigh
15:    17   ShinTakezou
15:    17   Dmitry-kazakov
15:    17   Ledrug
18:    13   Abu
18:    13   Paulo Jorente
18:    13   Waldorf

Nim[edit]

Translation of: Go
import algorithm, httpclient, re, strutils, tables
 
let
re1 = re("""<li><a href="/wiki/(.*?)"""")
re2 = re("""a href="/wiki/User:|mw/index\.php\?title=User:|wiki/Special:Contributions/([^"&]+)""")
 
const
Url1 = "http://rosettacode.org/wiki/Category:Programming_Tasks"
Url2 = "http://rosettacode.org/wiki/Category:Draft_Programming_Tasks"
Urls = [Url1, Url2]
 
var client = newHttpClient()
 
var tasks: seq[string]
var matches: array[1, string]
var start = 0
for url in Urls:
let body = client.getContent(url)
# Find all tasks.
while true:
start = body.find(re1, matches, start) + 1
if start == 0: break
if not matches[0].startsWith("Category:"):
tasks.add matches[0]
 
var authors: CountTable[string]
for task in tasks:
# Check the last or only history page for each task.
let page = "http://rosettacode.org/mw/index.php?title=$#&dir=prev&action=history".format(task)
let body = client.getContent(page)
# Find all the users in that page. The task author should be the final user on that page.
var matches: array[1, string]
start = 0
while true:
start = body.find(re2, matches, start) + 1
if start == 0: break
let author = matches[0].replace('-', ' ')
# Add this task to the author's count.
authors.inc(author)
 
# Sort the authors in descending order by number of tasks created.
authors.sort(Descending)
 
# Print the top twenty.
echo "Total tasks: ", tasks.len
echo "Total authors: ", authors.len
echo "\nThe top 20 authors by number of tasks created are:\n"
echo "Pos Tasks Author"
echo "=== ===== ======"
var pos = 0
for author, count in authors.pairs:
inc pos
echo ($pos).align(2), " ", ($count).align(3), " ", author
if pos == 20: break
Output:

On 2021-06-29.

The top 20 authors by number of tasks created are:

Pos  Tasks  Author
===  =====  ======
 1    196   Paddy3118
 2     84   CalmoSoft
 3     72   Markhobley
 4     66   Gerard_Schildberger
 5     55   Mwn3d
 6     39   NevilleDNZ
 7     39   Thundergnat
 8     33   Nigel_Galloway
 9     33   Short_Circuit
10     23   Grondilu
11     21   Blue_Prawn
12     20   Fwend
13     20   Dkf
14     18   Kernigh
15     17   Ledrug
16     17   ShinTakezou
17     17   Dmitry kazakov
18     14   Wherrera
19     13   Waldorf
20     13   Abu

Phix[edit]

To keep the output nice and short, lists the top 5 task creators.
Uses a cache: once a .hist file has been downloaded for a given task, it is assumed to be good forever. Each task is about 20K, so it will download around 25MB in total, for >= 1,219 tasks. It does those sequentially, using curl_easy_ handles. I guess if you really wanted to then using curl_multi_ handles would properly thrash the rosettacode servers.

Library: Phix/libcurl
-- demo\rosetta\List_task_authors.exw
include builtins\libcurl.e
atom curl = NULL
atom pErrorBuffer
 
function write_callback(atom pData, integer size, integer nmemb, integer fn)
integer bytes_written = size * nmemb
puts(fn,peek({pData,bytes_written}))
return bytes_written
end function
constant write_cb = call_back({'+', routine_id("write_callback")})
 
integer lp = 0 -- (last \r'd progress message length)
procedure progress(string msg, sequence args = {})
if length(args) then msg = sprintf(msg,args) end if
integer lm = length(msg)
if lm<lp then msg[$..$] = repeat(' ',lp-lm)&msg[$] end if
puts(1,msg)
lp = iff(msg[$]='\r'?lm:0)
end procedure
 
include builtins\timedate.e
-- for [Draft_]Programming_Tasks aka non-.hist files only:
integer refresh_cache = timedelta(days:=31) -- 0 for always
 
function open_download(string filename, url, integer i, n)
bool refetch = false
object text
filename = join_path({"rc_cache",filename})
if file_exists(filename) then
text = trim(get_text(filename))
refetch = (not sequence(text)) or (length(text)<10)
if not refetch and not match(".hist",filename) then
-- use existing file if <= refresh_cache (31 days) old
sequence last_mod = get_file_date(filename) -- (0.8.1+)
atom delta = timedate_diff(last_mod,date())
refetch = (delta>refresh_cache)
end if
else
string directory = get_file_path(filename)
if get_file_type(directory)!=FILETYPE_DIRECTORY then
if not create_directory(directory,make_parent:=true) then
crash("cannot create %s directory",{directory})
end if
end if
refetch = true
end if
if refetch then
progress("Downloading %d/%d %s...\r",{i,n,filename})
if curl=NULL then
curl_global_init()
curl = curl_easy_init()
pErrorBuffer = allocate(CURL_ERROR_SIZE)
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, pErrorBuffer)
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb)
end if
url = substitute(url,"%3A",":")
url = substitute(url,"%2A","*")
curl_easy_setopt(curl, CURLOPT_URL, url)
integer fn = open(filename,"wb")
if fn=-1 then ?9/0 end if
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fn)
while true do
CURLcode res = curl_easy_perform(curl)
if res=CURLE_OK then exit end if
string error = sprintf("%d",res)
if res=CURLE_COULDNT_RESOLVE_HOST then
error &= " [CURLE_COULDNT_RESOLVE_HOST]"
end if
progress("Error %s downloading file, retry?(Y/N):",{error})
if lower(wait_key())!='y' then abort(0) end if
printf(1,"Y\n")
end while
close(fn)
text = get_text(filename)
end if
return text
end function
 
function open_category(string filename, integer i, n)
return open_download(filename&".htm","http://rosettacode.org/wiki/Category:"&filename,i,n)
end function
 
function dewiki(string s)
-- extract tasks from eg `<li><a href="/wiki/100_doors"`
sequence tasks = {}
integer start = 1, finish = match(`<div class="printfooter">`,s)
s = s[1..finish-1]
while true do
start = match(`<li><a href="/wiki/`,s,start)
if start=0 then exit end if
start += length(`<li><a href="/wiki/`)
finish = find('"',s,start)
string task = s[start..finish-1]
task = substitute_all(task,{"*",":"},{"%2A","%3A"})
tasks = append(tasks,task)
start = finish+1
end while
return tasks
end function
 
constant {hex,ascii} = columnize({{"%2A","*"},
{"%3A",":"},
{"%27","'"},
{"%2B","+"},
{"%22",`"`},
{"%E2%80%93","-"},
{"%E2%80%99","'"},
{"%C3%A8","e"},
{"%C3%A9","e"}})
 
function html_clean(string s)
return substitute_all(s,hex,ascii)
end function
 
constant history_user = `<span class='history-user'><a href="`
 
function count_tasks()
sequence tasks = dewiki(open_category("Programming_Tasks",1,2))
& dewiki(open_category("Draft_Programming_Tasks",2,2))
integer ntasks = length(tasks)
sequence users = {},
utask = {},
ntask = {}
for i=1 to ntasks do
string ti = tasks[i],
url = sprintf("http://rosettacode.org/mw/index.php?title=%s&action=history&dir=prev&limit=1",{ti}),
contents = open_download(ti&".hist",url,i,ntasks)
integer k = match(history_user,contents)
if k=0 then ?9/0 end if
k = find('>',contents,k+length(history_user))
if k=0 then ?9/0 end if
k += 1
integer e = match("</a>",contents,k)
if e=0 then ?9/0 end if
string user = contents[k..e-1]
 
k = find(user,users)
if k=0 then
users = append(users,user)
utask = append(utask,{i})
ntask = append(ntask,1)
else
utask[k] &= i
ntask[k] += 1
end if
tasks[i] = html_clean(ti) -- (in case you want to show them)
if get_key()=#1B then progress("escape keyed\n") exit end if
end for
if curl!=NULL then
curl_easy_cleanup(curl)
free(pErrorBuffer)
curl = NULL
pErrorBuffer = NULL
end if
progress("\n")
sequence tags = custom_sort(ntask,tagset(length(ntask)))
integer top5 = 0
for i=length(tags) to 1 by -1 do
integer ui = tags[i]
printf(1,"%s tasks:%d\n",{users[ui],ntask[ui]})
top5 += 1 if top5>5 then exit end if
end for
return ntasks
end function
 
progress("Total: %d\n",{count_tasks()})
Output:

As of 6th Jan 2020

Paddy3118 tasks:176
Markhobley tasks:71
Gerard Schildberger tasks:59
Mwn3d tasks:55
NevilleDNZ tasks:39
Short Circuit tasks:33
Total: 1219

Raku[edit]

(formerly Perl 6)

Works with: Rakudo version 2018.03

The pseudocode above is no longer really useful as the page format has changed significantly since this task was written. Rather than checking every edit to see if it was a change to the task description, we'll just assume the user that created the page is the task author. This isn't 100% accurate; a very few pages got renamed and recreated by someone other than the original author without preserving the history, so they are misreported (15 Puzzle Game for instance,) but is as good as it is likely to get without extensive manual intervention. Subsequent edits to the task description are not credited. As it is, we must still make thousands of requests and pound the server pretty hard. Checking every edit would make the task several of orders of magnitude more abusive of the server (and my internet connection.)

use HTTP::UserAgent;
use URI::Escape;
use JSON::Fast;
use Sort::Naturally;
 
# Friendlier descriptions for task categories
my %cat = (
'Programming_Tasks' => 'Task',
'Draft_Programming_Tasks' => 'Draft'
);
 
my $client = HTTP::UserAgent.new;
 
my $url = 'http://rosettacode.org/mw';
 
my $tablefile = './RC_Authors.txt';
my $hashfile = './RC_Authors.json';
 
my %tasks;
 
# clear screen
run($*DISTRO.is-win ?? 'cls' !! 'clear');
 
%tasks = $hashfile.IO.e ?? $hashfile.IO.slurp.&from-json !! ( );
sleep 1;
 
#=begin update
 
note 'Retrieving task information...';
 
my %filter;
for %cat.keys.sort -> $category {
mediawiki-query(
$url, 'pages',
:generator<categorymembers>,
:gcmtitle("Category:$category"),
:gcmlimit<350>,
:rawcontinue(),
:prop<title>
).map( { %filter{.<title>} = %cat{$category} } )
}
 
my $delete = %tasks.keys (-) %filter.keys;
 
%tasks.delete($_) for $delete.keys; #Tasks that have changed names or been removed
 
my @add;
for %filter.keys -> $title {
if %tasks{$title}:exists {
%tasks{$title}<category> = %filter{$title} # update status
} else {
@add.push: $title => %filter{$title} # New Tasks
}
}
 
if @add {
.say for 'Adding new tasks:', |@add;
}
 
for @add -> $task {
mediawiki-query(
$url, 'pages',
:titles($task.key),
:prop<revisions>,
:rvprop<user|timestamp>,
:rvstart<2000-01-01T01:01:01Z>,
:rvdir<newer>,
:rvlimit<1>
).map: {
print clear, 1 + $++, ' ', .[0]<title>;
%tasks{.[0]<title>}<category> = $task.value;
%tasks{.[0]<title>}<author> = .[0]<revisions>[0]<user>;
%tasks{.[0]<title>}<date> = .[0]<revisions>[0]<timestamp>.subst(/'T'.+$/, '')
}
}
 
print clear;
 
# Save information to a local file
note "\nTask information saved to local file: {$hashfile.IO.absolute}";
$hashfile.IO.spurt(%tasks.&to-json);
 
#=end update
 
# Load information from local file
%tasks = $hashfile.IO.e ?? $hashfile.IO.slurp.&from-json !! ( );
 
# Convert saved task / author info to a table
note "\nBuilding table...";
my $count = +%tasks;
my $taskcnt = +%tasks.grep: *.value.<category> eq %cat<Programming_Tasks>;
my $draftcnt = $count - $taskcnt;
 
# Open a file handle to dump table in
my $out = open($tablefile, :w) or die "$!\n";
 
# Add table boilerplate and header
$out.say:
"\{|class=\"wikitable sortable\"\n",
"|+ As of { Date.today } :: Total Tasks: { $count }:: Tasks: { $taskcnt }",
" ::<span style=\"background-color:#ffd\"> Draft Tasks: { $draftcnt } </span>",
":: By {+%tasks{*}».<author>.unique} Authors\n",
"! Author !! Tasks !! Authored"
;
 
# Get sorted unique list of task authors
for %tasks{*}».<author>.unique.sort(&naturally) -> $author {
 
# Get list of tasks by this author
my @these = %tasks.grep( { $_.value.<author> eq $author } );
my $s = +@these == 1 ?? '' !! 's';
 
# Add author and contributions link to the first two cells
$out.say:
$author ~~ /\d/
?? "|- id=\"$author\"\n|data-sort-value=\"{ sort-key $author }\"|[[User:$author|$author]]\n"~
"|data-sort-value=\"{ [email protected] }\"|[[Special:Contributions/$author|"~
"{ [email protected] } task{ $s }]]"
!! "|- id=\"$author\"\n|[[User:$author|$author]]\n"~
"|data-sort-value=\"{ [email protected] }\"|[[Special:Contributions/$author|"~
"{ [email protected] } task{ $s }]]"
;
 
if +@these > 2 {
$out.say: "|style=\"padding: 0px;\"|\n",
"\{|class=\"broadtable sortable\" style=\"width: 100%;\"\n",
"! Task Name !! Date Added !! Status";
}
else {
$out.say: "|style=\"padding: 0px;\"|\n",
"\{|class=\"broadtable\" style=\"width: 100%;\"";
}
 
# Tasks by this author, sorted by name
for @these.sort({.key.&naturally}) -> $task {
 
my $color = $task.value.<category> eq 'Draft' ?? '#ffd' !! '#fff';
 
# add the task link, date and status to the table in the second cell
$out.say: "|-\n|style=\"background-color: $color;\"",
( $task.key ~~ /\d/
?? " data-sort-value=\"{ sort-key $task.key }\"| [[{uri-escape $task.key}|{$task.key}]]\n"
!! "| [[{uri-escape $task.key}|{$task.key}]]\n"
),
"|style=\"width: 10em; background-color: $color;\"| {$task.value.<date>}\n",
"|style=\"width: 6em; background-color: $color;\"| {$task.value.<category>}",
}
$out.say: '|}'
}
$out.say( "|}\n" );
$out.close;
 
 
note "Table file saved as: {$tablefile.IO.absolute}";
 
sub mediawiki-query ($site, $type, *%query) {
my $url = "$site/api.php?" ~ uri-query-string(
:action<query>, :format<json>, :formatversion<2>, |%query);
my $continue = '';
 
gather loop {
my $response = $client.get("$url&$continue");
my $data = from-json($response.content);
take $_ for $data.<query>.{$type}.values;
$continue = uri-query-string |($data.<query-continue>{*}».hash.hash or last);
}
}
 
sub uri-query-string (*%fields) { %fields.map({ "{.key}={uri-escape .value}" }).join("&") }
 
sub sort-key ($a) { $a.lc.subst(/(\d+)/, ->$/ {0~(65+($0.chars)).chr~$0},:g) }
 
sub clear { "\r" ~ ' ' x 100 ~ "\r" }
Sample output:

See full output at Rosetta_Code/List_authors_of_task_descriptions/Full_list

As of 2018-04-10 :: Total Tasks: 1080:: Tasks: 871 :: Draft Tasks: 209 :: By 251 Authors
Author Tasks Authored
2Powers 2 tasks
Names to numbers 2013-05-16 Draft
Solving coin problems 2013-05-16 Draft
12.175.32.19 1 task
Soundex 2009-11-12 Task
12Me21 1 task
Draw a rotating cube 2015-05-04 Task
many rows omitted...
Zorro1024 2 tasks
Perfect shuffle 2015-04-16 Task
Vector 2015-03-21 Draft
Zzo38 1 task
Thue-Morse 2015-09-20 Task
Русский 3 tasks
Task Name Date Added Status
Main step of GOST 28147-89 2012-08-31 Task
Old Russian measure of length 2013-01-09 Draft
Transportation problem 2013-05-24 Draft