Rosetta Code/Tasks without examples: Difference between revisions

(Added Go)
Line 294:
Since downloading all the pages can be very slow, this uses a cache. Limiting by "Phix" fairly obviously speeds it up tenfold :-)<br>
Output is similar to zkl; I assume the first four constants are self-explanatory.
<lang Phix>-- demo\rosetta\Tasks_without_examples.exw
-- Run-time options for the script.
constant output_html = true, -- when true the main script opens Tasks_Without_Examples.html for output
include_drafts = true, -- when true the Draft_Programming_Tasks category is scanned as well
summary = false, -- when false each task name is replaced by the description taken from its page
notlang = "Phix" -- tasks already listed on this language's page are filtered out; "" for all
include builtins\timedate.e
-- Maximum age a cached page may reach before it is downloaded again;
-- open_download() adds one day to this after every download it performs.
integer refresh_cache = timedelta(days:=30) -- 0 for always
include builtins\libcurl.e
-- libcurl easy handle, created by open_download() on the first download (NULL until then).
atom curl = NULL
-- Buffer of CURL_ERROR_SIZE bytes registered with the handle as CURLOPT_ERRORBUFFER.
atom pErrorBuffer
function write_callback(atom pData, integer size, integer nmemb, integer fn)
    -- Write callback handed to libcurl via CURLOPT_WRITEFUNCTION.
    --  pData      : address of the chunk of data just received
    --  size,nmemb : the chunk is size*nmemb bytes long
    --  fn         : the open file number registered via CURLOPT_WRITEDATA
    -- Copies the chunk to fn and returns the number of bytes consumed.
    integer bytes_written = size * nmemb
    -- Actually store the data: without this the function reported success
    -- while discarding every byte, leaving an empty cache file behind.
    puts(fn,peek({pData,bytes_written}))
    return bytes_written
end function
-- Machine-callable address of write_callback, passed to curl_easy_setopt().
-- NOTE(review): the '+' flag presumably selects the cdecl calling convention - confirm.
constant write_cb = call_back({'+', routine_id("write_callback")})
function open_download(string filename, url)
    -- Return the text of rc_cache/<filename>, downloading it from url first
    -- when there is no cached copy or the cached copy is older than refresh_cache.
    --  filename : cache file name, relative to the rc_cache directory
    --  url      : where to fetch the page from (%3A and %2A are decoded before use)
    -- Crashes if the cache directory or the cache file cannot be created.
    bool refetch = true
    if get_file_type("rc_cache")!=FILETYPE_DIRECTORY then
        if not create_directory("rc_cache") then
            crash("cannot create rc_cache directory")
        end if
    end if
    filename = join_path({"rc_cache",filename})
    if file_exists(filename) then
        -- use existing file if <= refresh_cache (30+ days) old
        sequence last_mod = get_file_date(filename) -- (0.8.1+)
        atom delta = timedate_diff(last_mod,date())
        refetch = (delta>refresh_cache)
    else
        -- no cached copy yet: make sure the directory it lives in exists
        -- (only meaningful here; an existing file implies an existing directory)
        string directory = get_file_path(filename)
        if get_file_type(directory)!=FILETYPE_DIRECTORY then
            if not create_directory(directory,make_parent:=true) then
                crash("cannot create %s directory",{directory})
            end if
        end if
    end if
    if refetch then
        printf(1,"Downloading %s...\n",{filename})
        if curl=NULL then
            -- first download: create the handle once and reuse it afterwards
            curl = curl_easy_init()
            pErrorBuffer = allocate(CURL_ERROR_SIZE)
            curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, pErrorBuffer)
            curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb)
        end if
        -- task names arrive percent-encoded (see dewiki); the url wants them raw
        url = substitute(url,"%3A",":")
        url = substitute(url,"%2A","*")
        curl_easy_setopt(curl, CURLOPT_URL, url)
        integer fn = open(filename,"wb")
        if fn=-1 then ?9/0 end if
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, fn)
        CURLcode res = curl_easy_perform(curl)
        -- close before get_text() re-reads the file, and so the handle is not leaked
        close(fn)
        if res!=CURLE_OK then
            -- report the failure and pause so that it gets noticed
            string error = sprintf("%d",res)
            printf(1, "Error %s downloading file\n", {error})
            {} = wait_key()
        end if
        refresh_cache += timedelta(days:=1) -- did I mention it is slow?
    end if
    return get_text(filename)
end function
function open_category(string filename)
    -- Return the text of the Rosetta Code category page named filename
    -- (eg "Programming_Tasks"), going through the open_download() cache.
    -- NOTE(review): the url prefix was an empty string, which cannot be fetched;
    -- it is restored here from the "/wiki/" paths that dewiki() parses - confirm
    -- the scheme/host against the live site.
    return open_download(filename&".htm","http://rosettacode.org/wiki/Category:"&filename)
end function
function dewiki(string s)
    -- Extract the page names from every `<li><a href="/wiki/NAME"` link in s.
    -- Anything from `<div class="printfooter">` onwards is ignored.
    -- Returns a sequence of names with '*' and ':' percent-encoded
    -- (as %2A and %3A), in the order they occur in s.
    sequence tasks = {}
    string link = "<li><a href=\"/wiki/"
    integer start = 1, finish = match(`<div class="printfooter">`,s)
    if finish!=0 then
        -- drop the footer; when there is none the whole text is scanned
        s = s[1..finish-1]
    end if
    while true do
        start = match(link,s,start)
        if start=0 then exit end if
        start += length(link)
        finish = find('"',s,start)
        -- unterminated href (eg a truncated page): stop, rather than
        -- restart the scan from the beginning and loop forever
        if finish=0 then exit end if
        string task = s[start..finish-1]
        task = substitute(task,"*","%2A")
        task = substitute(task,":","%3A")
        tasks = append(tasks,task)
        start = finish+1
    end while
    return tasks
end function
function extract_tasks()
    -- Build the list of tasks to report.
    -- Starts from the Programming_Tasks category (plus Draft_Programming_Tasks
    -- when include_drafts is set), removes those already listed on the notlang
    -- category page (unless notlang is ""), and, unless summary is set, replaces
    -- each remaining task name with the description extracted from its page.
    -- Returns the resulting sequence of strings.
    -- NOTE(review): the two url literals below were empty strings, which cannot
    -- be fetched; they are restored from the "/wiki/" paths that dewiki() parses
    -- - confirm the scheme/host against the live site.

    -- extract tasks from eg `<li><a href="/wiki/100_doors"`
    sequence tasks = dewiki(open_category("Programming_Tasks"))
    if include_drafts then
        tasks &= dewiki(open_category("Draft_Programming_Tasks"))
    end if
    if length(notlang) then
        -- filter already done in specified language
        string langurl = "http://rosettacode.org/wiki/Category:"&notlang
        sequence done = dewiki(open_download(notlang&".htm",langurl))
        integer k = 0
        for i=1 to length(tasks) do
            if not find(tasks[i],done) then
                -- keep: compact the survivors towards the front, in place
                k += 1
                tasks[k] = tasks[i]
            end if
        end for
        tasks = tasks[1..k]
        done = {}
    end if
    if not summary then
        -- replace with contents
        for i=1 to length(tasks) do
            string ti = tasks[i],
                   url = sprintf("http://rosettacode.org/wiki/%s",{ti}),
                   contents = open_download(ti&".htm",url)
            -- the description starts after the first </div> following the infobox
            integer start = match(`</div>`,contents,match(`<div class="infobox"`,contents))+length(`</div>`)
            integer finish = match(`<div id="toc"`,contents,start)-1
            -- ... but draft tasks with too few languages have no toc:
            if finish=-1 then finish = match(`<h2>`,contents,start)-1 end if
            -- ... and if no languages at all, use the footer:
            if finish=-1 then finish = match(`</div><div class="printfooter">`,contents,start)-1 end if
            if finish=-1 then ?9/0 end if
            contents = contents[start..finish]
            ti = substitute(ti,"_"," ")
            if not match("<b>"&ti&"</b>",contents) then
                -- (ps: I refuse to panic over the occasional replicated header...)
                contents = sprintf("<h3>%s</h3>%s",{ti,contents})
            end if
            tasks[i] = contents
            -- Escape abandons the loop; later entries stay as plain task names
            if get_key()=#1B then exit end if
        end for
    end if
    if curl!=NULL then
        -- release the handle and error buffer created by open_download(),
        -- rather than just forgetting about them
        curl_easy_cleanup(curl)
        free(pErrorBuffer)
        curl = NULL
        pErrorBuffer = NULL
    end if
    return tasks
end function
function html_clean(string ri)
    -- Replace the percent-escapes known to occur in task names with plain
    -- characters (accented e is flattened to e, dash/quote variants to ASCII).
    -- Returns the cleaned-up string.
    sequence decode = {{"%3A",":"},
                       {"%E2%80%93","-"},
                       {"%E2%80%99","'"},
                       {"%27","'"},
                       {"%2B","+"},
                       {"%C3%A8","e"},
                       {"%C3%A9","e"},
                       {"%22","\""},
                       {"%2A","*"}}
    for d=1 to length(decode) do
        ri = substitute(ri,decode[d][1],decode[d][2])
    end for
    return ri
end function
-- Page wrapper for the html output; html_header is a printf format taking
-- the generation date (%s) and the number of entries (%d).
constant html_header = """
<!DOCTYPE html>
<html lang="en">
<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>Rosettacode Tasks without examples</title>
<h2>Rosettacode Tasks without examples</h2>
Generated %s, %d entries<br><br>
""",
         html_footer = """
</html>
"""

-- Main script: collect the results, then emit them either as an html page
-- (output_html) or as plain lines on the console.
sequence results = extract_tasks()
if output_html then
    integer fn = open("Tasks_Without_Examples.html","w")
    if fn=-1 then crash("cannot create Tasks_Without_Examples.html") end if
    printf(fn,html_header,{format_timedate(date()),length(results)})
    for i=1 to length(results) do
        -- entries go through "%s" so any '%' they contain is not taken as a format
        printf(fn,"%s<br>",{html_clean(results[i])})
    end for
    puts(fn,html_footer)
    close(fn)
else
    for i=1 to length(results) do
        printf(1,"%s\n",{html_clean(results[i])})
    end for
end if
{} = wait_key()</lang>
