Rosetta Code/Find bare lang tags

From Rosetta Code
Revision as of 23:16, 5 December 2013 by rosettacode>AnonymousJohn (Added Haskell solution)
Task
Rosetta Code/Find bare lang tags
You are encouraged to solve this task according to the task description, using any language you may know.

Find all <lang> tags without a language specified in the text of a page. Display counts by language section:

Description

<lang>Pseudocode</lang>

=={{header|C}}==
<lang C>printf("Hello world!\n");</lang>

=={{header|Perl}}==
<lang>print "Hello world!\n"</lang>

should display something like

2 bare language tags.

1 in perl
1 in no language

For extra credit, allow multiple files to be read. Summarize all results by language:

5 bare language tags.

2 in c ([[Foo]], [[Bar]])
1 in perl ([[Foo]])
2 in no language ([[Baz]])

For more extra credit, use the Media Wiki API to test actual RC tasks.

AutoHotkey

This code has no syntax highlighting, because Rosetta Code's highlighter fails with code that contains literal </lang> tags.

Stole RegEx Needle from Perl

task =
(
Description

<lang>Pseudocode</lang>

=={{header|C}}==
<lang C>printf("Hello world!\n");</lang>

=={{header|Perl}}==
<lang>print "Hello world!\n"</lang>
)
lang := "no lanugage", out := Object(lang, 0), total := 0
Loop Parse, task, `r`n
	If RegExMatch(A_LoopField, "==\s*{{\s*header\s*\|\s*([^\s\}]+)\s*}}\s*==", $)
		lang := $1, out[lang] := 0
	else if InStr(A_LoopField, "<lang>")
		out[lang]++
For lang, num in Out
	If num
		total++, str .= "`n" num " in " lang
MsgBox % clipboard := total " bare lang tags.`n" . str

Output:

2 bare lang tags.

1 in no lanugage
1 in Perl

Erlang

<lang Erlang> -module( find_bare_lang_tags ).

-export( [task/0] ).

task() -> {ok, Binary} = file:read_file( "priv/find_bare_lang_tags_1" ), Lines = string:tokens( erlang:binary_to_list(Binary), "\n" ), {_Lang, Dict} = lists:foldl( fun count_empty_lang/2, {"no language", dict:new()}, Lines ), Count_langs = [{dict:fetch(X, Dict), X} || X <- dict:fetch_keys(Dict)], io:fwrite( "~p bare language tags.~n", [lists:sum([X || {X, _Y} <- Count_langs])] ), [io:fwrite( "~p in ~p~n", [X, Y] ) || {X, Y} <- Count_langs].


count_empty_lang( Line, {Lang, Dict} ) -> Empty_lang = string:str( Line, "<lang>" ), New_dict = dict_update_counter( Empty_lang, Lang, Dict ), New_lang = new_lang( string:str( Line,"==[[:Category:{{{1}}}|{{{1}}}]] [[Category:{{{1}}}]] Property "Implemented in language" (as page type) with input value "{{{1}}}" contains invalid characters or is incomplete and therefore can cause unexpected results during a query or annotation process.==" ), string:sub_string( Line, Start+1, Stop-1 ). </lang>

Output:
60>  find_bare_lang_tags:task().
2 bare language tags.
1 in "no language"
1 in "Perl"

Haskell

There are actually many different Regex packages available for Haskell. For this example, I chose TDFA, a very fast POSIX ERE engine. To change engines, simply change the import statement. If you use a Perl-style RE engine, you'll have to modify the expressions slightly.

This solution can be compiled into a program that will either take space-delimited list of files as its argument, or take input from STDIN if no arguments are provided. The Media Wiki API bonus is not attempted.

<lang Haskell>import System.Environment import Text.Printf import Text.Regex.TDFA import Data.List import Data.Array import qualified Data.Map as Map

splitByMatches :: String -> [MatchText String] -> [String] splitByMatches str matches = foldr (\match acc ->

   let before  =  take (matchOffset).head $ acc
       after  =  drop (matchOffset + matchLen).head $ acc
       matchOffset  =  fst.snd.(!0) $ match
       matchLen  =  snd.snd.(!0) $ match
   in  before:after:(tail acc)
   ) [str] matches

{-| Takes a string and splits it into the different languages used. All text

   before the language headers is put into the key "" -}

splitByLanguage :: String -> Map.Map String String splitByLanguage str = Map.fromList.zip langs $ splitByMatches str allMatches

   where langs  =  "":(map (fst.(!1)) allMatches)
         allMatches  =  matchAllText (makeRegex headerRegex :: Regex) str
         headerRegex  =  "==space:*{{space:*headerspace:*\\|space:*([^ }]*)space:*}}[^=]*=="

{-| Takes a string and counts the number of time a valid, but bare, lang tag

   appears. It does not attempt to ignore valid tags inside lang blocks. -}

countBareLangTags :: String -> Int countBareLangTags = matchCount (makeRegex "<langspace:*>" :: Regex)

main = do

   args <- getArgs
   (contents, files) <- if length args == 0 then do
       -- If there aren't arguments, read from stdin
           content  <-  getContents
           return ([content],[""])
       else if length args == 1 then do
       -- If there's only one argument, read the file, but don't display
       -- the filename in the results.
           content  <-  readFile (head args)
           return ([content],[""])
       else do
       -- Otherwise, read all the files and display their file names.
           contents  <-  mapM readFile args
           return (contents, args)
   let bareTagMaps  =  map (Map.map countBareLangTags.splitByLanguage) $ contents
   let tagsWithFiles  =  zipWith (\tags file -> Map.map (addFile file) tags) bareTagMaps files
   let allBareTags  =  foldl combineMaps Map.empty tagsWithFiles
   printBareTags allBareTags
       where addFile file count  =  (count, if count>0 && file/="" then [file] else [])
             combineMaps  =  Map.foldrWithKey insertItem
             insertItem  =  Map.insertWith (\(newC,newF) (oldC,oldF) -> (oldC+newC,oldF++newF))
       

printBareTags :: Map.Map String (Int,[String]) -> IO () printBareTags tags = do

   let numBare  =  Map.foldr ((+).fst) 0 tags
   printf "%d bare language tags:\n\n" numBare
   flip mapM_ (Map.toAscList tags) (\(lang,(count,files)) ->
       if count <= 0 then return () else printf "%d in %s%s\n" count (
       if lang == "" then "no language" else lang) (filesString files))

filesString :: [String] -> String filesString [] = "" filesString files = " ("++listString files++")"

   where listString [file]  =  ""++file++""
         listString (file:files)  =  ""++file++", "++listString files</lang>

Here are the input files I used to test:

example1.wiki
-------------------------------------------------------------
Description

<lang>Pseudocode</lang>

=={{header|C}}==
<lang C>printf("Hello world!\n");</lang>

=={{header|Perl}}==
<lang>print "Hello world!\n"</lang>
example2.wiki
-------------------------------------------------------------
Description

<lang>Pseudocode</lang>

=={{header|C}}==
<lang>printf("Hello world!\n");</lang>

=={{header|Perl}}==
<lang>print "Hello world!\n"</lang>
<lang Perl>print "Goodbye world!\n"</lang>

=={{header|Haskell}}==
<lang>hubris lang = "I'm so much better than a "++lang++" programmer because I program in Haskell."</lang>

And the output:

6 bare language tags:

2 in no language ([[example1.wiki]], [[example2.wiki]])
1 in C ([[example2.wiki]])
1 in Haskell ([[example2.wiki]])
2 in Perl ([[example1.wiki]], [[example2.wiki]])

Perl

This is a simple implementation that does not attempt either extra credit. <lang perl>my $lang = 'no language'; my $total = 0; my %blanks = (); while (<>) {

 if (m/<lang>/) {
   if (exists $blanks{lc $lang}) {
     $blanks{lc $lang}++
   } else {
     $blanks{lc $lang} = 1
   }
   $total++
 } elsif (m/==\s*Template:\s*header\s*\\s*==/) {
   $lang = lc $1
 }

}

if ($total) { print "$total bare language tag" . ($total > 1 ? 's' : ) . ".\n\n"; while ( my ($k, $v) = each(%blanks) ) { print "$k in $v\n" } }</lang>

Racket

Note that this follows the task, but the output is completely bogus since the actual <lang> tags that it finds are in <pre> and in code...

<lang racket>

  1. lang racket

(require net/url net/uri-codec json)

(define (get-text page)

 (define ((get k) x) (dict-ref x k))
 ((compose1 (get '*) car (get 'revisions) cdar hash->list (get 'pages)
            (get 'query) read-json get-pure-port string->url format)
  "http://rosettacode.org/mw/api.php?~a"
  (alist->form-urlencoded
   `([titles . ,page] [prop . "revisions"] [rvprop . "content"]
     [format . "json"] [action . "query"]))))

(define (find-bare-tags page)

 (define in (open-input-string (get-text page)))
 (define rx
   ((compose1 pregexp string-append)
    "<\\s*lang\\s*>|"
    "==\\s*\\{\\{\\s*header\\s*\\|\\s*([^{}]*?)\\s*\\}\\}\\s*=="))
 (let loop ([lang "no language"] [bare '()])
   (match (regexp-match rx in)
     [(list _ #f) (loop lang (dict-update bare lang add1 0))]
     [(list _ lang) (loop lang bare)]
     [#f (if (null? bare) (printf "no bare language tags\n")
             (begin (printf "~a bare language tags\n" (apply + (map cdr bare)))
                    (for ([b bare]) (printf "  ~a in ~a\n" (cdr b) (car b)))))])))

(find-bare-tags "Rosetta Code/Find bare lang tags") </lang>

Output:
8 bare language tags
  2 in no language
  4 in Perl
  1 in AutoHotkey
  1 in Tcl

More-extra credit

Add the following code at the bottom, run, watch results. <lang racket> (define (get-category cat)

 (let loop ([c #f])
   (define t
     ((compose1 read-json get-pure-port string->url format)
      "http://rosettacode.org/mw/api.php?~a"
      (alist->form-urlencoded
       `([list . "categorymembers"] [cmtitle . ,(format "Category:~a" cat)]
         [cmcontinue . ,(and c (dict-ref c 'cmcontinue))]
         [cmlimit . "500"] [format . "json"] [action . "query"]))))
   (define (c-m key) (dict-ref (dict-ref t key '()) 'categorymembers #f))
   (append (for/list ([page (c-m 'query)]) (dict-ref page 'title))
           (cond [(c-m 'query-continue) => loop] [else '()]))))

(for ([page (get-category "Programming Tasks")])

 (printf "Page: ~a " page)
 (find-bare-tags page))

</lang>

Tcl

For all the extra credit (note, takes a substantial amount of time due to number of HTTP requests):

Library: Tcllib (Package: json)
Library: Tcllib (Package: textutil::split)
Library: Tcllib (Package: uri)

<lang tcl>package require Tcl 8.5 package require http package require json package require textutil::split package require uri

proc getUrlWithRedirect {base args} {

   set url $base?[http::formatQuery {*}$args]
   while 1 {

set t [http::geturl $url] if {[http::status $t] ne "ok"} { error "Oops: url=$url\nstatus=$s\nhttp code=[http::code $token]" } if {[string match 2?? [http::ncode $t]]} { return $t } # OK, but not 200? Must be a redirect... set url [uri::resolve $url [dict get [http::meta $t] Location]] http::cleanup $t

   }

}

proc get_tasks {category} {

   global cache
   if {[info exists cache($category)]} {

return $cache($category)

   }
   set query [dict create cmtitle Category:$category]
   set tasks [list]

   while {1} {

set response [getUrlWithRedirect http://rosettacode.org/mw/api.php \ action query list categorymembers format json cmlimit 500 {*}$query]

# Get the data out of the message

       set data [json::json2dict [http::data $response]]
       http::cleanup $response

       # add tasks to list
       foreach task [dict get $data query categorymembers] {
           lappend tasks [dict get [dict create {*}$task] title]
       }

       if {[catch {

dict get $data query-continue categorymembers cmcontinue } continue_task]} then {

           # no more continuations, we're done
           break
       }
       dict set query cmcontinue $continue_task
   }
   return [set cache($category) $tasks]

} proc getTaskContent task {

   set token [getUrlWithRedirect http://rosettacode.org/mw/index.php \

title $task action raw]

   set content [http::data $token]
   http::cleanup $token
   return $content

}

proc init {} {

   global total count found
   set total 0
   array set count {}
   array set found {}

} proc findBareTags {pageName pageContent} {

   global total count found
   set t {{}}
   lappend t {*}[textutil::split::splitx $pageContent \

{==\s*\{\{\s*header\s*\|\s*([^{}]+?)\s*\}\}\s*==}]

   foreach {sectionName sectionText} $t {

set n [regexp -all {<lang>} $sectionText] if {!$n} continue incr count($sectionName) $n lappend found($sectionName) $pageName incr total $n

   }

} proc printResults {} {

   global total count found
   puts "$total bare language tags."
   if {$total} {

puts "" if {[info exists found()]} { puts "$count() in task descriptions\ (\[\[[join $found() {]], [[}]\]\])" unset found() } foreach sectionName [lsort -dictionary [array names found]] { puts "$count($sectionName) in $sectionName\ (\[\[[join $found($sectionName) {]], [[}]\]\])" }

   }

}

init set tasks [get_tasks Programming_Tasks]

  1. puts stderr "querying over [llength $tasks] tasks..."

foreach task [get_tasks Programming_Tasks] {

   #puts stderr "$task..."
   findBareTags $task [getTaskContent $task]

} printResults</lang>