Rosetta Code/Find bare lang tags: Difference between revisions
m (→{{header|Ruby}}: Hash.new takes 2 params, not three) |
|||
Line 425:
1 in PHP (["Greatest_subsequential_sum"])
</pre>
=={{header|Scala}}==
To analyse RosettaCode pages, invoke Java with <code>-Dhttp.agent=Anything</code> to work around CloudFlare blocking Java from accessing the RosettaCode site.
<lang Scala>// Map lines to a list of Option(heading -> task) for each bare lang tag found.
val headerFormat = "==[{]+header[|]([^}]*)[}]+==".r
val langFormat = "<lang([^>]*)>".r

/** Scans wiki text line by line and reports every bare `<lang>` tag.
  *
  * A tag is "bare" when nothing but whitespace follows `lang` inside the angle
  * brackets. Each bare tag is attributed to the most recent `=={{header|X}}==`
  * heading seen so far ("" before the first heading). A heading on the same
  * line as a tag applies to that tag.
  *
  * @param lines    the wiki text, one element per line
  * @param taskName label attached to every finding ("" when there is no task name)
  * @return `Some(heading -> taskName)` for each bare tag, in input order; a line
  *         without any bare tag contributes a single `None`
  */
def mapped(lines: Seq[String], taskName: String = ""): Seq[Option[(String, String)]] = {
  // Most recent language heading; mutated as the lines are walked in order.
  var heading = ""
  lines.flatMap { line =>
    headerFormat.findFirstMatchIn(line).foreach(m => heading = m.group(1))
    // Look at every <lang ...> tag on the line, not only the first one, so that
    // "<lang C>..</lang> <lang>..</lang>" still reports its bare tag.
    val bareTags = langFormat.findAllMatchIn(line).map(_.group(1).trim).filter(_.isEmpty).toList
    val found: Seq[Option[(String, String)]] =
      if (bareTags.isEmpty) Seq(None)
      else bareTags.map(_ => Some(heading -> taskName))
    found
  }
}
// Group results as a Map(heading -> task1, task2, ...)
// Empty findings (None) are dropped; the task names of each heading keep their
// input order and are NOT de-duplicated, so the list length is the tag count.
// Built with a strict `map` rather than `mapValues`: `mapValues` yields a lazy
// view (a MapView on Scala 2.13+, which is no longer a Map).
def reduced(results: Seq[Option[(String,String)]]): Map[String, Seq[String]] =
  results.flatten
    .groupBy { case (heading, _) => heading }
    .map { case (heading, pairs) => heading -> pairs.map(_._2) }
// Render one report line per heading: "<tag count> in <heading>[ (task,task,...)]".
// An empty heading is reported under a placeholder label, and the parenthesised
// task list (distinct, non-empty task names) is omitted when there is none.
// NOTE(review): "langauge" looks like a typo for "language"; the string is kept
// as-is here because the sample output on this page reproduces it.
def format(results: Map[String,Seq[String]]) = results.map { case (section, pages) =>
  val label = if (section.isEmpty) "no langauge" else section
  val named = pages.filter(_ != "").distinct
  val suffix = if (named.nonEmpty) named.mkString(" (", ",", ")") else ""
  s"${pages.size} in $label" + suffix
}
// Total number of bare tags: the task-list lengths of all headings added together.
def count(results: Map[String,Seq[String]]) =
  results.foldLeft(0) { case (total, (_, tasks)) => total + tasks.size }
// Single and multi-source support
// Wraps one input (a Source plus an optional task name) and exposes the raw
// per-line findings (`map`), the grouped findings (`mapReduce`) and a printable
// report (`summary`).
case class BareLangFinder(source: scala.io.Source, taskName: String = "") {
  // A Source can only be traversed once, so the lines are read a single time and
  // the findings cached; repeated calls to map/mapReduce/summary then agree.
  // `toList` forces every line to be read before the source is closed.
  lazy val map: Seq[Option[(String, String)]] =
    try mapped(source.getLines().toList, taskName)
    finally source.close()
  def mapReduce = reduced(map)
  def summary = format(mapReduce) mkString "\n"
}
// Multi-source entry point: concatenates the per-line findings (`map`) of every
// finder in `inputs` and groups the combined list with `reduced`.
def mapReduce(inputs: Seq[BareLangFinder]) = reduced(inputs.flatMap(_.map))</lang>
'''Examples:'''
<pre>val test = """
Description
<lang>Pseudocode</lang>
=={{header|C}}==
<lang C>printf("Hello world!\n");</lang>
=={{header|Perl}}==
<lang>print "Hello world!\n"</lang>
"""
// Single-source run on the in-memory sample text defined above.
println(BareLangFinder(scala.io.Source.fromString(test)).summary)
// Presumably the in-code alternative to passing -Dhttp.agent on the command line; left disabled.
// System.setProperty("http.agent", "RosettaCode/1.0")
// Multi-source run: one finder per task, each reading that task's action=raw URL.
val tasks = List("Greatest_common_divisor", "Greatest_element_of_a_list", "Greatest_subsequential_sum")
val inputs = for (task <- tasks; url = "http://rosettacode.org/wiki?action=raw&title=" + task)
yield BareLangFinder(scala.io.Source.fromURL(url), task)
// Merge the findings of all tasks into one Map(heading -> tasks).
val bare = mapReduce(inputs)
// Blank line separating the single-source report from the multi-source one.
println
println(s"${count(bare)} bare language tags in ${tasks.size} tasks:")
println(format(bare) mkString "\n")</pre>
{{out}}
<pre>1 in Perl
1 in no langauge
10 bare language tags in 3 tasks:
2 in Mathprog (Greatest_subsequential_sum)
1 in gnuplot (Greatest_common_divisor)
2 in МК-61/52 (Greatest_element_of_a_list)
1 in Bracmat (Greatest_element_of_a_list)
1 in PHP (Greatest_subsequential_sum)
2 in Euler Math Toolbox (Greatest_common_divisor,Greatest_element_of_a_list)
1 in ooRexx (Greatest_element_of_a_list)</pre>
=={{header|Tcl}}==
|
Revision as of 14:37, 17 October 2014
You are encouraged to solve this task according to the task description, using any language you may know.
Find all <lang> tags without a language specified in the text of a page. Display counts by language section:
Description <lang>Pseudocode</lang> =={{header|C}}== <lang C>printf("Hello world!\n");</lang> =={{header|Perl}}== <lang>print "Hello world!\n"</lang>
should display something like
2 bare language tags. 1 in perl 1 in no language
For extra credit, allow multiple files to be read. Summarize all results by language:
5 bare language tags. 2 in c ([[Foo]], [[Bar]]) 1 in perl ([[Foo]]) 2 in no language ([[Baz]])
For more extra credit, use the Media Wiki API to test actual RC tasks.
AutoHotkey
This code has no syntax highlighting, because Rosetta Code's highlighter fails with code that contains literal </lang> tags.
Stole RegEx Needle from Perl
task = ( Description <lang>Pseudocode</lang> =={{header|C}}== <lang C>printf("Hello world!\n");</lang> =={{header|Perl}}== <lang>print "Hello world!\n"</lang> ) lang := "no language", out := Object(lang, 0), total := 0 Loop Parse, task, `r`n If RegExMatch(A_LoopField, "==\s*{{\s*header\s*\|\s*([^\s\}]+)\s*}}\s*==", $) lang := $1, out[lang] := 0 else if InStr(A_LoopField, "<lang>") out[lang]++ For lang, num in Out If num total++, str .= "`n" num " in " lang MsgBox % clipboard := total " bare lang tags.`n" . str
Output:
2 bare lang tags. 1 in no language 1 in Perl
Erlang
<lang Erlang> -module( find_bare_lang_tags ).
-export( [task/0] ).
task() -> {ok, Binary} = file:read_file( "priv/find_bare_lang_tags_1" ), Lines = string:tokens( erlang:binary_to_list(Binary), "\n" ), {_Lang, Dict} = lists:foldl( fun count_empty_lang/2, {"no language", dict:new()}, Lines ), Count_langs = [{dict:fetch(X, Dict), X} || X <- dict:fetch_keys(Dict)], io:fwrite( "~p bare language tags.~n", [lists:sum([X || {X, _Y} <- Count_langs])] ), [io:fwrite( "~p in ~p~n", [X, Y] ) || {X, Y} <- Count_langs].
count_empty_lang( Line, {Lang, Dict} ) -> Empty_lang = string:str( Line, "<lang>" ), New_dict = dict_update_counter( Empty_lang, Lang, Dict ), New_lang = new_lang( string:str( Line,"==[[:Category:{{{1}}}|{{{1}}}]] [[Category:{{{1}}}]] Property "Implemented in language" (as page type) with input value "{{{1}}}" contains invalid characters or is incomplete and therefore can cause unexpected results during a query or annotation process.==" ), string:sub_string( Line, Start+1, Stop-1 ). </lang>
- Output:
60> find_bare_lang_tags:task(). 2 bare language tags. 1 in "no language" 1 in "Perl"
Haskell
There are actually many different Regex packages available for Haskell. For this example, I chose TDFA, a very fast POSIX ERE engine. To change engines, simply change the import statement. If you use a Perl-style RE engine, you'll have to modify the expressions slightly.
This solution can be compiled into a program that will either take a space-delimited list of files as its arguments, or take input from STDIN if no arguments are provided. Additionally, if you specify the -w flag as the first argument, it will take a list of Rosetta Code wiki page names and search those pages. Note that the page names must be as they appear in your URL bar -- with underscores in place of spaces.
<lang Haskell>import System.Environment import Network.HTTP import Text.Printf import Text.Regex.TDFA import Data.List import Data.Array import qualified Data.Map as Map
{-| Takes a string and cuts out the text matched in the MatchText array. -} splitByMatches :: String -> [MatchText String] -> [String] splitByMatches str matches = foldr splitHead [str] matches
where splitHead match acc = before:after:(tail acc) where before = take (matchOffset).head$ acc after = drop (matchOffset + matchLen).head$ acc matchOffset = fst.snd.(!0)$ match matchLen = snd.snd.(!0)$ match
{-| Takes a string and counts the number of time a valid, but bare, lang tag
appears. It does not attempt to ignore valid tags inside lang blocks. -}
countBareLangTags :: String -> Int countBareLangTags = matchCount (makeRegex "<langspace:*>" :: Regex)
{-| Takes a string and counts the number of bare lang tags per section of the
text. All tags before the first section are put into the key "". -}
countByLanguage :: String -> Map.Map String Int countByLanguage str = Map.fromList.filter ((>0).snd)$ zip langs counts
where counts = map countBareLangTags.splitByMatches str$ allMatches langs = "":(map (fst.(!1)) allMatches) allMatches = matchAllText (makeRegex headerRegex :: Regex) str headerRegex = "==space:*{{space:*headerspace:*\\|space:*([^ }]*)space:*}}[^=]*=="
main = do
args <- getArgs (contents, files) <- if length args == 0 then do -- If there aren't arguments, read from stdin content <- getContents return ([content],[""]) else if length args == 1 then do -- If there's only one argument, read the file, but don't display -- the filename in the results. content <- readFile (head args) return ([content],[""]) else if (args !! 0) == "-w" then do -- If there's more than one argument and the first one is the -w option, -- use the rest of the arguments as page titles and load them from the wiki. contents <- mapM getPageContent.tail$ args return (contents, if length args > 2 then tail args else [""]) else do -- Otherwise, read all the files and display their file names. contents <- mapM readFile args return (contents, args) let tagsPerLang = map countByLanguage contents let tagsWithFiles = zipWith addFileToTags files tagsPerLang let combinedFiles = Map.unionsWith combine tagsWithFiles printBareTags combinedFiles where addFileToTags file = Map.map (flip (,) [file]) combine cur next = (fst cur + fst next, snd cur ++ snd next)
printBareTags :: Map.Map String (Int,[String]) -> IO () printBareTags tags = do
let numBare = Map.foldr ((+).fst) 0 tags printf "%d bare language tags:\n\n" numBare mapM_ (\(lang,(count,files)) -> printf "%d in %s%s\n" count (if lang == "" then "no language" else lang) (filesString files) ) (Map.toAscList tags)
filesString :: [String] -> String filesString [] = "" filesString ("":rest) = filesString rest filesString files = " ("++listString files++")"
where listString [file] = ""++file++"" listString (file:files) = ""++file++", "++listString files
getPageContent :: String -> IO String getPageContent title = do
response <- simpleHTTP.getRequest$ url getResponseBody response where url = "http://rosettacode.org/mw/index.php?action=raw&title="++title</lang>
Here are the input files I used to test:
example1.wiki ------------------------------------------------------------- Description <lang>Pseudocode</lang> =={{header|C}}== <lang C>printf("Hello world!\n");</lang> =={{header|Perl}}== <lang>print "Hello world!\n"</lang>
example2.wiki ------------------------------------------------------------- Description <lang>Pseudocode</lang> =={{header|C}}== <lang>printf("Hello world!\n");</lang> =={{header|Perl}}== <lang>print "Hello world!\n"</lang> <lang Perl>print "Goodbye world!\n"</lang> =={{header|Haskell}}== <lang>hubris lang = "I'm so much better than a "++lang++" programmer because I program in Haskell."</lang>
And the output:
6 bare language tags: 2 in no language ([[example1.wiki]], [[example2.wiki]]) 1 in C ([[example2.wiki]]) 1 in Haskell ([[example2.wiki]]) 2 in Perl ([[example1.wiki]], [[example2.wiki]])
Additionally, I tested with 100_doors and Huffman_coding. The following resulted:
5 bare language tags: 1 in no language ([[100_doors]]) 1 in C ([[Huffman_coding]]) 1 in CoffeeScript ([[Huffman_coding]]) 1 in Perl ([[Huffman_coding]]) 1 in PostScript ([[100_doors]])
Icon and Unicon
The following is a Unicon-specific solution. <lang unicon>import Utils # To get the FindFirst class
procedure main()
keys := ["{{header|","<lang>"] lang := "No language" tags := table(0) total := 0
ff := FindFirst(keys) f := reads(&input, -1)
f ? while tab(ff.locate()) do {
if "[[:Category:{{{1}}}|{{{1}}}]] [[Category:{{{1}}}]] Property "Implemented in language" (as page type) with input value "{{{1}}}" contains invalid characters or is incomplete and therefore can cause unexpected results during a query or annotation process.")))
else (tags[lang] +:= 1, total +:= 1)
}
write(total," bare language tags:\n") every pair := !sort(tags) do write(pair[2]," in ",pair[1])
end</lang>
Sample run using example given in problem statement:
->rcfblt <rcfblt.in 2 bare language tags: 1 in No language 1 in perl ->
Perl
This is a simple implementation that does not attempt either extra credit. <lang perl># Count bare <lang> tags per language section of the wiki text read from STDIN
# or from the files named on the command line, then print a summary.
my $lang   = 'no language';   # section in effect until the first header is seen
my $total  = 0;               # bare tags seen over the whole input
my %blanks = ();              # lower-cased section name => bare tag count
while (<>) {
    if (m/<lang>/) {
        # At most one bare tag is counted per input line.
        $blanks{lc $lang}++;  # a missing entry starts from 0
        $total++;
    }
    elsif (m/==\s*\{\{\s*header\s*\|\s*([^\s\}]+)\s*\}\}\s*==/) {
        # Section header such as =={{header|Perl}}== : remember its language.
        $lang = lc $1;
    }
}

if ($total) {
    print "$total bare language tag" . ($total > 1 ? 's' : '') . ".\n\n";
    # "<count> in <language>", sorted so the report order is stable between runs.
    print "$blanks{$_} in $_\n" for sort keys %blanks;
}</lang>
Perl 6
The only tricky thing here is the use of the ms form of match, short for m:sigspace. This causes whitespace in the regex to be considered "significant", that is, it matches optional whitespace at those positions, as if you'd put \s* there. Of course, the regexes themselves are in Perl 6 syntax, which is quite different from Perl 5 regex syntax (and arguably much cleaner). Regex syntax is perhaps the area in which Perl 6 diverges most from Perl 5. <lang perl6>my $lang = '(no language)'; my $total = 0; my %blanks;
for lines() {
when / '<lang>' / { %blanks{$lang}++; $total++; } when ms/ '==' 'Template:' 'header' '' '==' / { $lang = $0.lc; }
}
say "$total bare language tag{ 's' if $total != 1 }\n"; say .value, ' in ', .key for %blanks.sort;</lang>
- Output:
2 bare language tags 1 in (no language) 1 in perl
Racket
Note that this follows the task, but the output is completely bogus since the actual <lang> tags that it finds are in <pre> and in code...
<lang racket>
#lang racket
(require net/url net/uri-codec json)
(define (get-text page)
(define ((get k) x) (dict-ref x k)) ((compose1 (get '*) car (get 'revisions) cdar hash->list (get 'pages) (get 'query) read-json get-pure-port string->url format) "http://rosettacode.org/mw/api.php?~a" (alist->form-urlencoded `([titles . ,page] [prop . "revisions"] [rvprop . "content"] [format . "json"] [action . "query"]))))
(define (find-bare-tags page)
(define in (open-input-string (get-text page))) (define rx ((compose1 pregexp string-append) "<\\s*lang\\s*>|" "==\\s*\\{\\{\\s*header\\s*\\|\\s*([^{}]*?)\\s*\\}\\}\\s*==")) (let loop ([lang "no language"] [bare '()]) (match (regexp-match rx in) [(list _ #f) (loop lang (dict-update bare lang add1 0))] [(list _ lang) (loop lang bare)] [#f (if (null? bare) (printf "no bare language tags\n") (begin (printf "~a bare language tags\n" (apply + (map cdr bare))) (for ([b bare]) (printf " ~a in ~a\n" (cdr b) (car b)))))])))
(find-bare-tags "Rosetta Code/Find bare lang tags") </lang>
- Output:
8 bare language tags 2 in no language 4 in Perl 1 in AutoHotkey 1 in Tcl
More-extra credit
Add the following code at the bottom, run, watch results. <lang racket> (define (get-category cat)
(let loop ([c #f]) (define t ((compose1 read-json get-pure-port string->url format) "http://rosettacode.org/mw/api.php?~a" (alist->form-urlencoded `([list . "categorymembers"] [cmtitle . ,(format "Category:~a" cat)] [cmcontinue . ,(and c (dict-ref c 'cmcontinue))] [cmlimit . "500"] [format . "json"] [action . "query"])))) (define (c-m key) (dict-ref (dict-ref t key '()) 'categorymembers #f)) (append (for/list ([page (c-m 'query)]) (dict-ref page 'title)) (cond [(c-m 'query-continue) => loop] [else '()]))))
(for ([page (get-category "Programming Tasks")])
(printf "Page: ~a " page) (find-bare-tags page))
</lang>
Ruby
Quoting from the FAQ: "If you just want the raw wikitext without any other information whatsoever, it's best to use index.php's action=raw mode instead of the API" <lang Ruby>require "open-uri" require "cgi"
tasks = ["Greatest_common_divisor", "Greatest_element_of_a_list", "Greatest_subsequential_sum"] part_uri = "http://rosettacode.org/wiki?action=raw&title=" Report = Struct.new(:count, :tasks) result = Hash.new{|h,k| h[k] = Report.new(0, [])}
tasks.each do |task|
puts "processing #{task}" current_lang = "no language" open(part_uri + CGI.escape(task)).each_line do |line| current_lang = Regexp.last_match["lang"] if /==\{\{header\|(?<lang>.+)\}\}==/ =~ line num_no_langs = line.scan(/<lang\s*>/).size if num_no_langs > 0 then result[current_lang].count += num_no_langs result[current_lang].tasks << task end end
end
puts "\n#{result.values.map(&:count).inject(&:+)} bare language tags.\n\n" result.each{|k,v| puts "#{v.count} in #{k} (#{v.tasks})"}</lang>
- Output:
processing Greatest_common_divisor processing Greatest_element_of_a_list processing Greatest_subsequential_sum 10 bare language tags. 2 in Euler Math Toolbox (["Greatest_common_divisor", "Greatest_element_of_a_list"]) 1 in gnuplot (["Greatest_common_divisor"]) 1 in Bracmat (["Greatest_element_of_a_list"]) 2 in МК-61/52 (["Greatest_element_of_a_list", "Greatest_element_of_a_list"]) 1 in ooRexx (["Greatest_element_of_a_list"]) 2 in Mathprog (["Greatest_subsequential_sum", "Greatest_subsequential_sum"]) 1 in PHP (["Greatest_subsequential_sum"])
Scala
To analyse RosettaCode pages, invoke Java with -Dhttp.agent=Anything
to work around CloudFlare blocking Java from accessing the RosettaCode site.
<lang Scala>// Map lines to a list of Option(heading -> task) for each bare lang tag found.
val headerFormat = "==[{]+header[|]([^}]*)[}]+==".r
val langFormat = "<lang([^>]*)>".r
def mapped(lines: Seq[String], taskName: String = "") = {
var heading = "" for (line <- lines; head = headerFormat.findFirstMatchIn(line).map(_ group 1); lang = langFormat.findFirstMatchIn(line).map(_ group 1)) yield { if (head.isDefined) heading = head.get lang.map(_.trim).filter(_ == "").map(_ => heading -> taskName) }
} // Group results as a Map(heading -> task1, task2, ...) def reduced(results: Seq[Option[(String,String)]]) =
results.flatten.groupBy(_._1).mapValues(_.unzip._2)
// Format each heading as "tasklist.size in heading (tasklist)" def format(results: Map[String,Seq[String]]) = results.map{case (heading, tasks) =>
val h = if (heading.length > 0) heading else "no langauge" val hmsg = s"${tasks.size} in $h" val t = tasks.filterNot(_ == "") val tmsg = if (t.isEmpty) "" else t.distinct.mkString(" (", ",", ")") hmsg + tmsg
} def count(results: Map[String,Seq[String]]) = results.values.map(_.size).sum
// Single and multi-source support case class BareLangFinder(source: scala.io.Source, taskName: String = "") {
def map = mapped(source.getLines.toSeq, taskName) def mapReduce = reduced(map) def summary = format(mapReduce) mkString "\n"
} def mapReduce(inputs: Seq[BareLangFinder]) = reduced(inputs.flatMap(_.map))</lang> Examples:
val test = """ Description <lang>Pseudocode</lang> =={{header|C}}== <lang C>printf("Hello world!\n");</lang> =={{header|Perl}}== <lang>print "Hello world!\n"</lang> """ println(BareLangFinder(scala.io.Source.fromString(test)).summary) // System.setProperty("http.agent", "RosettaCode/1.0") val tasks = List("Greatest_common_divisor", "Greatest_element_of_a_list", "Greatest_subsequential_sum") val inputs = for (task <- tasks; url = "http://rosettacode.org/wiki?action=raw&title=" + task) yield BareLangFinder(scala.io.Source.fromURL(url), task) val bare = mapReduce(inputs) println println(s"${count(bare)} bare language tags in ${tasks.size} tasks:") println(format(bare) mkString "\n")
- Output:
1 in Perl 1 in no langauge 10 bare language tags in 3 tasks: 2 in Mathprog (Greatest_subsequential_sum) 1 in gnuplot (Greatest_common_divisor) 2 in МК-61/52 (Greatest_element_of_a_list) 1 in Bracmat (Greatest_element_of_a_list) 1 in PHP (Greatest_subsequential_sum) 2 in Euler Math Toolbox (Greatest_common_divisor,Greatest_element_of_a_list) 1 in ooRexx (Greatest_element_of_a_list)
Tcl
For all the extra credit (note, takes a substantial amount of time due to number of HTTP requests):
<lang tcl>package require Tcl 8.5 package require http package require json package require textutil::split package require uri
# Fetch $base with the query parameters given in $args (alternating name/value
# words), following HTTP redirects until a 2xx response arrives.
# Returns the http token of that response; the caller must http::cleanup it.
# Raises an error when the transfer itself fails, or when the server answers
# with a non-2xx reply that carries no Location header to follow.
proc getUrlWithRedirect {base args} {
    set url $base?[http::formatQuery {*}$args]
    while 1 {
        set t [http::geturl $url]
        # Capture what the error message needs before the token is released.
        set status [http::status $t]
        set code [http::code $t]
        if {$status ne "ok"} {
            http::cleanup $t
            error "Oops: url=$url\nstatus=$status\nhttp code=$code"
        }
        if {[string match 2?? [http::ncode $t]]} {
            return $t
        }
        # OK, but not 2xx? Must be a redirect...
        set meta [http::meta $t]
        http::cleanup $t
        if {![dict exists $meta Location]} {
            error "Oops: url=$url\nhttp code=$code (no Location header to follow)"
        }
        set url [uri::resolve $url [dict get $meta Location]]
    }
}
proc get_tasks {category} {
global cache if {[info exists cache($category)]} {
return $cache($category)
} set query [dict create cmtitle Category:$category] set tasks [list] while {1} {
set response [getUrlWithRedirect http://rosettacode.org/mw/api.php \ action query list categorymembers format json cmlimit 500 {*}$query]
# Get the data out of the message
set data [json::json2dict [http::data $response]] http::cleanup $response # add tasks to list foreach task [dict get $data query categorymembers] { lappend tasks [dict get [dict create {*}$task] title] } if {[catch {
dict get $data query-continue categorymembers cmcontinue } continue_task]} then {
# no more continuations, we're done break } dict set query cmcontinue $continue_task } return [set cache($category) $tasks]
} proc getTaskContent task {
set token [getUrlWithRedirect http://rosettacode.org/mw/index.php \
title $task action raw]
set content [http::data $token] http::cleanup $token return $content
}
proc init {} {
global total count found set total 0 array set count {} array set found {}
} proc findBareTags {pageName pageContent} {
global total count found set t {{}} lappend t {*}[textutil::split::splitx $pageContent \
{==\s*\{\{\s*header\s*\|\s*([^{}]+?)\s*\}\}\s*==}]
foreach {sectionName sectionText} $t {
set n [regexp -all {<lang>} $sectionText] if {!$n} continue incr count($sectionName) $n lappend found($sectionName) $pageName incr total $n
}
} proc printResults {} {
global total count found puts "$total bare language tags." if {$total} {
puts "" if {[info exists found()]} { puts "$count() in task descriptions\ (\[\[[join $found() {]], [[}]\]\])" unset found() } foreach sectionName [lsort -dictionary [array names found]] { puts "$count($sectionName) in $sectionName\ (\[\[[join $found($sectionName) {]], [[}]\]\])" }
}
}
init set tasks [get_tasks Programming_Tasks]
#puts stderr "querying over [llength $tasks] tasks..."
foreach task [get_tasks Programming_Tasks] {
#puts stderr "$task..." findBareTags $task [getTaskContent $task]
} printResults</lang>