Syntax highlighting using Mediawiki formatting: Difference between revisions
m (Fixed typo in task description.) |
(→{{header|ALGOL 68}}: Much simpler parsing (as in other samples), fixed multi-line comments) |
||
Line 35: | Line 35: | ||
Handles upper-stropping Algol 68 sources (as used by ALGOL 68G and most other compilers). |
Handles upper-stropping Algol 68 sources (as used by ALGOL 68G and most other compilers). |
||
'' |
''CO Convert an upper-stropped Algol 68 source to "wiki" format'' |
||
'' |
'' each line is preceeded by a space,'' |
||
'' |
'' bold words are enclosed in ''' and ''' and comments in '' and '''' |
||
'' |
'' ', &, < and > are converted to &apos; &amp; &lt; and &gt;'' |
||
'' |
'' everything else if output as is'' |
||
'' quote-stropping, point-stropping and res-stropping is not suppoered'' |
|||
''# the source is read from stand in and written to stand out #'' |
|||
'' the source is read from stand in and written to stand out'' |
|||
''# the last line in the file must end with a newline #'' |
|||
'' the last line in the file must end with a newline'' |
|||
''# { and } are assumed to be alternatives for ( and ), if { } should be #'' |
|||
'' { and } are assumed to be alternatives for ( and ), if { } should be'' |
|||
''# treated as comments ( as in ALGOL68RS/algol68toc ) #'' |
|||
'' |
'' treated as comments ( as in ALGOL68RS/algol68toc )'' |
||
'' change rs style brief comments to TRUE'' |
|||
''CO'' |
|||
'''BEGIN''' |
'''BEGIN''' |
||
'''BOOL''' in string := '''FALSE'''; |
|||
'''BOOL''' in brief comment := '''FALSE'''; |
|||
'''INT''' rs comment depth := 0; |
|||
'''STRING''' comment delimiter := ""; |
|||
''# TRUE if {} delimits a nestable brief comment, as in ALGOL 68RS and #'' |
''# TRUE if {} delimits a nestable brief comment, as in ALGOL 68RS and #'' |
||
Line 59: | Line 66: | ||
); |
); |
||
'''CHAR''' nl = '''REPR''' 10; ''# newline character #'' |
'''CHAR''' nl = '''REPR''' 10; ''# newline character #'' |
||
'''INT''' error count := 0; ''# number of errors reported #'' |
|||
'''STRING''' line := nl; ''# current source line #'' |
'''STRING''' line := nl; ''# current source line #'' |
||
'''INT''' pos := '''LWB''' line; ''# current position in line #'' |
'''INT''' pos := '''LWB''' line; ''# current position in line #'' |
||
'''CHAR''' c := " "; ''# current source character #'' |
'''CHAR''' c := " "; ''# current source character #'' |
||
'''PROC''' error = ( '''STRING''' message )'''VOID''': ''# reports an error #'' |
|||
'''BEGIN''' |
|||
error count +:= 1; |
|||
print( ( newline, newline, "**** ", message, newline ) ) |
|||
'''END''' ''# error #'' ; |
|||
''# reports an unterminated construct ( e.g. string, comment ) #'' |
|||
'''PROC''' unterminated = ( '''STRING''' construct )'''VOID''': error( "Unterminated " + construct ); |
|||
'''PROC''' next char = '''VOID''': ''# gets the next source character, stores it in c #'' |
'''PROC''' next char = '''VOID''': ''# gets the next source character, stores it in c #'' |
||
'''IF''' pos <= '''UPB''' line '''THEN''' |
'''IF''' pos <= '''UPB''' line '''THEN''' |
||
Line 81: | Line 80: | ||
'''THEN''' |
'''THEN''' |
||
line +:= nl; ''# have another line #'' |
line +:= nl; ''# have another line #'' |
||
c := line[ pos := '''LWB''' line ]; |
|||
c := line[ pos ]; |
|||
pos +:= 1 |
pos +:= 1 |
||
'''ELSE''' |
'''ELSE''' |
||
Line 89: | Line 87: | ||
'''FI''' ''# next char #'' ; |
'''FI''' ''# next char #'' ; |
||
'''PROC''' out char = ( '''CHAR''' ch )'''VOID''': ''# conveerts and outputs ch #'' |
'''PROC''' out char = ( '''CHAR''' ch )'''VOID''': ''# conveerts and outputs ch #'' |
||
'''IF''' ch = nl '''THEN''' |
'''IF''' ch = nl '''THEN''' |
||
'''IF''' '''NOT''' in brief comment '''AND''' rs comment depth = 0 '''AND''' comment delimiter = "" '''THEN''' |
|||
print( ( newline, " " ) ) ''# newline not in a comment #'' |
|||
'''ELSE''' ''# newline in a comment #'' |
|||
italic delimiter; print( ( newline, " " ) ); italic delimiter |
|||
'''FI''' |
|||
'''ELIF''' ch = "<" '''THEN''' print( ( "&lt;" ) ) |
'''ELIF''' ch = "<" '''THEN''' print( ( "&lt;" ) ) |
||
'''ELIF''' ch = ">" '''THEN''' print( ( "&gt;" ) ) |
'''ELIF''' ch = ">" '''THEN''' print( ( "&gt;" ) ) |
||
Line 96: | Line 99: | ||
'''ELSE''' print( ch ) |
'''ELSE''' print( ch ) |
||
'''FI''' ''# out char #'' ; |
'''FI''' ''# out char #'' ; |
||
''# outputs the current character and gets the next #'' |
|||
'''PROC''' out and next char = '''VOID''': '''BEGIN''' out char( c ); next char '''END'''; |
|||
''# outputs a wiki start/end italic delimiter #'' |
''# outputs a wiki start/end italic delimiter #'' |
||
'''PROC''' italic delimiter = '''VOID''': print( ( "''" ) ); |
'''PROC''' italic delimiter = '''VOID''': print( ( "''" ) ); |
||
''# outputs a wiki start/end bold delimiter #'' |
|||
'''PROC''' bold delimiter = '''VOID''': print( ( "'''" ) ); |
|||
''# returns TRUE if the current character is a string delimiter #'' |
|||
'''PROC''' have string delimiter = '''BOOL''': c = """"; |
|||
''# returns TRUE if the current character can start a bold word #'' |
''# returns TRUE if the current character can start a bold word #'' |
||
'''PROC''' have bold = '''BOOL''': c >= "A" '''AND''' c <= "Z"; |
'''PROC''' have bold = '''BOOL''': c >= "A" '''AND''' c <= "Z"; |
||
''# outputs a brief comment to stand out #'' |
|||
''# end char is the closing delimiter, #'' |
|||
''# nested char is the opening delimiter for nestable brief comments #'' |
|||
''# if nested char is blank, the brief comment does not nest #'' |
|||
''# this handles ALGOL 68RS and algol68toc style {} comments #'' |
|||
'''PROC''' copy brief comment = ( '''CHAR''' end char, '''CHAR''' nested char )'''VOID''': |
|||
'''BEGIN''' |
|||
out char( c ); |
|||
'''WHILE''' next char; |
|||
'''NOT''' at eof '''AND''' c /= end char |
|||
'''DO''' |
|||
'''IF''' c = nested char '''AND''' nested char /= " " '''THEN''' |
|||
''# nested brief comment #'' |
|||
copy brief comment( end char, nested char ) |
|||
'''ELSE''' |
|||
''# notmal comment char #'' |
|||
out char( c ) |
|||
'''FI''' |
|||
'''OD'''; |
|||
'''IF''' at eof '''THEN''' |
|||
''# unterminated comment #'' |
|||
unterminated( """" + end char + """ comment" ); |
|||
c := end char |
|||
'''FI'''; |
|||
out char( c ); |
|||
next char |
|||
'''END''' ''# copy brief comment #'' ; |
|||
'''PROC''' copy string = '''VOID''': ''# outputs a string denotation from the source #'' |
|||
'''WHILE''' have string delimiter '''DO''' ''# within a string denotation, #'' |
|||
'''WHILE''' out char( c ); ''# "" denotes the " character #'' |
|||
next char; |
|||
'''NOT''' at eof '''AND''' '''NOT''' have string delimiter |
|||
'''DO''' '''SKIP''' '''OD'''; |
|||
'''IF''' '''NOT''' have string delimiter '''THEN''' |
|||
unterminated( "string" ); |
|||
c := """" |
|||
'''FI'''; |
|||
out char( c ); |
|||
next char |
|||
'''OD''' ''# copy string #'' ; |
|||
'''PROC''' get bold word = '''STRING''': ''# gets a bold word from then source #'' |
'''PROC''' get bold word = '''STRING''': ''# gets a bold word from then source #'' |
||
'''BEGIN''' |
'''BEGIN''' |
||
Line 150: | Line 111: | ||
result |
result |
||
'''END''' ''# get bold word #'' ; |
'''END''' ''# get bold word #'' ; |
||
'''PROC''' copy to bold = '''STRING''': ''# copies the source to the output #'' |
|||
'''IF''' at eof ''# until a bold word is encountered #'' |
|||
'''THEN''' "" |
|||
'''ELSE''' '''STRING''' result := ""; |
|||
'''WHILE''' out char( c ); |
|||
next char; |
|||
'''NOT''' at eof |
|||
'''AND''' '''NOT''' have bold |
|||
'''DO''' '''SKIP''' '''OD'''; |
|||
'''IF''' '''NOT''' at eof '''THEN''' result := get bold word '''FI'''; |
|||
result |
|||
'''FI''' ''# copy to bold #'' ; |
|||
'''PROC''' bold word or comment = '''VOID''': ''# handles a bold COMMENT #'' |
|||
'''IF''' '''STRING''' bold word := get bold word; ''# or other bold word #'' |
|||
bold word = "CO" '''OR''' bold word = "COMMENT" |
|||
'''THEN''' |
|||
italic delimiter; ''# have a bold comment #'' |
|||
'''STRING''' delimiter = bold word; |
|||
'''WHILE''' print( ( bold word ) ); |
|||
bold word := copy to bold; |
|||
'''NOT''' at eof |
|||
'''AND''' bold word /= delimiter |
|||
'''DO''' '''SKIP''' '''OD'''; |
|||
'''IF''' at eof '''THEN''' |
|||
unterminated( """" + delimiter + """ comment" ) |
|||
'''FI'''; |
|||
print( ( delimiter ) ); |
|||
italic delimiter |
|||
'''ELSE''' ''# some other bold word #'' |
|||
bold delimiter; |
|||
print( ( bold word ) ); |
|||
bold delimiter |
|||
'''FI''' ''# bold word or comment #'' ; |
|||
''# copy the source to stand out, conveerting to wiki format #'' |
''# copy the source to stand out, conveerting to wiki format #'' |
||
next char; |
next char; |
||
'''WHILE''' '''NOT''' at eof '''DO''' |
'''WHILE''' '''NOT''' at eof '''DO''' |
||
'''IF''' |
'''IF''' in string '''THEN''' ''# currently in a string #'' |
||
in string := c /=""""; |
|||
out and next char |
|||
'''ELIF''' in brief comment '''THEN''' ''# currently in a brief comment #'' |
|||
in brief comment := c /= "#"; |
|||
out and next char; |
|||
'''IF''' '''NOT''' in brief comment '''THEN''' italic delimiter '''FI''' |
|||
'''ELIF''' rs comment depth > 0 '''THEN''' ''# currently in a nesting {...} comment #'' |
|||
'''IF''' c = "}" '''THEN''' rs comment depth -:= 1 '''FI'''; |
|||
out and next char; |
|||
'''IF''' rs comment depth < 1 '''THEN''' italic delimiter '''FI''' |
|||
'''ELIF''' comment delimiter /= "" '''THEN''' ''# in a CO/COMMENT comment #'' |
|||
'''IF''' '''NOT''' have bold '''THEN''' |
|||
out and next char ''# haven't reached a bold word #'' |
|||
'''ELSE''' |
|||
'''STRING''' word = get bold word; ''# at the start of a bold word #'' |
|||
print( ( word ) ); |
|||
'''IF''' word = comment delimiter '''THEN''' |
|||
''# reached the end of the comment #'' |
|||
italic delimiter; |
|||
comment delimiter := "" |
|||
'''FI''' |
|||
'''FI''' |
|||
'''ELIF''' c = """" '''THEN''' ''# start of a string or character denotation #'' |
|||
out and next char; |
|||
in string := '''TRUE''' |
|||
'''ELIF''' c = "#" '''THEN''' ''# start of a brief comment such as this one #'' |
|||
italic delimiter; |
italic delimiter; |
||
out and next char; |
|||
in brief comment := '''TRUE''' |
|||
'''ELIF''' c = "{" '''AND''' rs style brief comments '''THEN''' |
'''ELIF''' c = "{" '''AND''' rs style brief comments '''THEN''' ''# nestable brief #'' |
||
''# |
italic delimiter; ''# comment ( ALGOL 68RS and algol68toc ) #'' |
||
out and next char; |
|||
rs comment depth := 1 |
|||
italic delimiter |
|||
'''ELIF''' have string delimiter '''THEN''' ''# STRING or CHAR denotation #'' |
|||
copy string |
|||
'''ELIF''' have bold '''THEN''' ''# have a bold word #'' |
'''ELIF''' have bold '''THEN''' ''# have a bold word #'' |
||
'''STRING''' word = get bold word; |
|||
''' |
'''IF''' word /= "CO" '''AND''' word /= "COMMENT" '''THEN''' |
||
print( ( "'''", word, "'''" ) ) ''# non-comment bold word #'' |
|||
'''ELSE''' |
|||
italic delimiter; ''# start of a bold comment #'' |
|||
next char |
|||
print( ( word ) ); |
|||
comment delimiter := word |
|||
'''FI''' |
|||
'''ELSE''' ''# anything else #'' |
|||
out and next char |
|||
'''FI''' |
'''FI''' |
||
'''OD'''; |
'''OD'''; |
||
'''IF''' in string '''THEN''' print( ( "**** unterminated string", newline ) ) |
|||
''' |
'''ELIF''' in brief comment '''THEN''' print( ( "**** unterminated brief comment", newline ) ) |
||
'''ELIF''' rs comment depth > 0 '''THEN''' print( ( "**** unterminated {...} comment", newline ) ) |
|||
print( ( "**** ", |
'''ELIF''' comment delimiter /= "" '''THEN''' print( ( "**** unterminated ", comment delimiter, newline ) ) |
||
'''FI''' |
'''FI''' |
||
Revision as of 20:44, 10 October 2023
- Introduction
When formatting a page for display, Mediawiki allows the page to include bold and italic text by placing the bold/italic text within paired repeated-single quote characters - 3 single quotes for bold and 2 for italic, 5 for bold italic.
E.g.:
'''bold-word''' and ''italic-word'' appear as bold-word and italic-word.
This could be used to provide simple syntax-highlighting without the use of the relatively more expensive <syntaxhighlight> tags or for languages not currently supported by Pygments.
A few languages on Rosetta Code are currently using schemes like this.
- Task
The task is to write a syntax highlighter that given a source in your language will output a wiki formatted version of the source with the keywords/reserved words in bold and the comments in italics.
Note that each source line (including blank lines) should be output with a leading space, to ensure the source is treated as a single block.
Additionally, translate the following characters:
- single-quote (') to &apos;
- ampersand (&) to &amp;
- less-than (<) to &lt;
- greater-than (>) to &gt;
If your language doesn't have keywords/reserved words or comments, use your judgement on what to highlight in bold or italic : )
- Presenting your source
Instead of showing your source within syntaxhighlight tags and having a separate output block, just show the source that would be output from your program when given its own source to process.
I.e., don't use syntaxhighlight tags.
- See also
https://www.mediawiki.org/wiki/Help:Formatting
ALGOL 68
Handles upper-stropping Algol 68 sources (as used by ALGOL 68G and most other compilers).
CO Convert an upper-stropped Algol 68 source to "wiki" format
      each line is preceded by a space,
      bold words are enclosed in ''' and ''' and comments in '' and ''
      ', &, < and > are converted to &apos; &amp; &lt; and &gt;
      everything else is output as is
      quote-stropping, point-stropping and res-stropping is not supported
   the source is read from stand in and written to stand out
   the last line in the file must end with a newline
   { and } are assumed to be alternatives for ( and ), if { } should be
      treated as comments ( as in ALGOL68RS/algol68toc )
      change rs style brief comments to TRUE
CO
BEGIN

    # scanner state - at most one of these is "active" at any time          #
    BOOL   in string         := FALSE; # inside a string denotation         #
    BOOL   in brief comment  := FALSE; # inside a hash delimited comment    #
    INT    rs comment depth  := 0;     # greater than 0 inside {} comment   #
    STRING comment delimiter := "";    # bold word that ends the comment    #

    # TRUE if {} delimits a nestable brief comment, as in ALGOL 68RS and    #
    # algol68toc, FALSE if {} are alternatives to () as in ALGOL 68G        #
    BOOL rs style brief comments = FALSE;

    BOOL at eof := FALSE; # TRUE if EOF has been reached, FALSE otherwise   #
    on logical file end( stand in        # set EOF handler for stand in     #
                       , ( REF FILE f )BOOL:
                             # note that we reached EOF on the latest read  #
                             # and return TRUE so processing can continue   #
                             at eof := TRUE
                       );

    CHAR   nl    = REPR 10;  # newline character                            #
    STRING line := nl;       # current source line                          #
    INT    pos  := LWB line; # current position in line                     #
    CHAR   c    := " ";      # current source character                     #

    PROC next char = VOID:   # gets the next source character, stores it in c #
         IF pos <= UPB line THEN
             c := line[ pos ];        # not past the end of the source line #
             pos +:= 1
         ELIF     # past the end of the current source line - get the next  #
             at eof := FALSE;
             read( ( line, newline ) );
             NOT at eof
         THEN
             line +:= nl;                               # have another line #
             c := line[ pos := LWB line ];
             pos +:= 1
         ELSE
             line := "";                                      # reached eof #
             c    := REPR 0
         FI # next char # ;

    PROC out char = ( CHAR ch )VOID:              # converts and outputs ch #
         IF ch = nl THEN
             IF   NOT in brief comment
              AND rs comment depth = 0
              AND comment delimiter = ""
             THEN
                 print( ( newline, " " ) )       # newline not in a comment #
             ELSE                                    # newline in a comment #
                 # wiki italics do not span lines: close and then re-open   #
                 italic delimiter; print( ( newline, " " ) ); italic delimiter
             FI
         ELIF ch = "<" THEN print( ( "&lt;"   ) )
         ELIF ch = ">" THEN print( ( "&gt;"   ) )
         ELIF ch = "&" THEN print( ( "&amp;"  ) )
         ELIF ch = "'" THEN print( ( "&apos;" ) )
         ELSE print( ch )
         FI # out char # ;

    # outputs the current character and gets the next                       #
    PROC out and next char = VOID: BEGIN out char( c ); next char END;

    # outputs a wiki start/end italic delimiter                             #
    PROC italic delimiter = VOID: print( ( "''" ) );

    # returns TRUE if the current character can start a bold word           #
    PROC have bold = BOOL: c >= "A" AND c <= "Z";

    PROC get bold word = STRING:        # gets a bold word from the source  #
         BEGIN
             STRING result := "";
             WHILE have bold OR c = "_" DO
                 result +:= c;
                 next char
             OD;
             result
         END # get bold word # ;

    # copy the source to stand out, converting to wiki format               #
    next char;
    WHILE NOT at eof DO
        IF in string THEN                           # currently in a string #
            in string := c /= """";
            out and next char
        ELIF in brief comment THEN           # currently in a brief comment #
            in brief comment := c /= "#";
            out and next char;
            IF NOT in brief comment THEN italic delimiter FI
        ELIF rs comment depth > 0 THEN # currently in a nesting {...} comment #
            IF c = "}" THEN rs comment depth -:= 1 FI;
            out and next char;
            IF rs comment depth < 1 THEN italic delimiter FI
        ELIF comment delimiter /= "" THEN   # in a bold word delimited comment #
            IF NOT have bold THEN
                out and next char            # haven't reached a bold word #
            ELSE
                STRING word = get bold word; # at the start of a bold word  #
                print( ( word ) );
                IF word = comment delimiter THEN
                    # reached the end of the comment                        #
                    italic delimiter;
                    comment delimiter := ""
                FI
            FI
        ELIF c = """" THEN     # start of a string or character denotation  #
            out and next char;
            in string := TRUE
        ELIF c = "#" THEN     # start of a brief comment such as this one   #
            italic delimiter;
            out and next char;
            in brief comment := TRUE
        ELIF c = "{" AND rs style brief comments THEN      # nestable brief #
            italic delimiter;       # comment ( ALGOL 68RS and algol68toc ) #
            out and next char;
            rs comment depth := 1
        ELIF have bold THEN                             # have a bold word #
            STRING word = get bold word;
            IF word /= "CO" AND word /= "COMMENT" THEN
                print( ( "'''", word, "'''" ) )    # non-comment bold word #
            ELSE
                italic delimiter;                 # start of a bold comment #
                print( ( word ) );
                comment delimiter := word
            FI
        ELSE                                               # anything else #
            out and next char
        FI
    OD;

    # report any construct that was still open when the source ended        #
    IF   in string THEN
        print( ( "**** unterminated string", newline ) )
    ELIF in brief comment THEN
        print( ( "**** unterminated brief comment", newline ) )
    ELIF rs comment depth > 0 THEN
        print( ( "**** unterminated {...} comment", newline ) )
    ELIF comment delimiter /= "" THEN
        print( ( "**** unterminated ", comment delimiter, newline ) )
    FI

END
AWK
Parsing of patterns may not be correct in all cases.
# convert an AWK source to wiki format
#     each line is preceded by a space,
#     reserved words are enclosed in ''' and ''' and comments in '' and ''
#     ', &, < and > are converted to &apos; &amp; &lt; and &gt;
#     everything else is output as is
# the wiki source is written to stdout

BEGIN \
{
    # reserved word list as in gawk and treating getline as reserved
    kw = "BEGIN/BEGINFILE/END/ENDFILE/"                        \
         "break/case/continue/default/delete/do/while/else/"  \
         "exit/for/in/function/func/if/next/nextfile/switch/" \
         "getline";
    n  = split( kw, reservedWords, "/" );
    for( w = 1; w <= n; w ++ )
    {
        reserved[ reservedWords[ w ] ] = w;
    }
} # BEGIN

# main rule: highlight one source line
{
    printf( " " );
    line = $0;
    # escape the wiki-significant characters; "&" must be handled first so the
    # ampersands introduced by the later replacements are not escaped again
    # ( "\\&" in a gsub replacement is a literal ampersand )
    gsub( /&/, "\\&amp;",  line );
    gsub( /</, "\\&lt;",   line );
    gsub( />/, "\\&gt;",   line );
    gsub( /'/, "\\&apos;", line );
    if( line != "" )
    {
        c = "";
        nextChar();
        do
        {
            if ( c == "#" )
            {
                # comment - the rest of the line is output in italics
                printf( "''#%s''", line );
                c = "";
            }
            else if( c == "\"" )
            {
                # string literal
                do
                {
                    if( c == "\\" )
                    {
                        # escape: output the backslash, the escaped character
                        # is output below
                        printf( "%s", c );
                        nextChar();
                    }
                    printf( "%s", c );
                    nextChar();
                }
                while( c != "\"" && c != "" );
                if( c != "\"" )
                {
                    printf( "**** Unterminated string\n" );
                }
                else
                {
                    nextChar();
                }
                printf( "\"" );
            }
            else if( c == "/" && lastC !~ /[A-Za-z0-9_.]/ )
            {
                # pattern - a "/" that does not follow an operand
                bracketDepth = 0;
                printf( "%s", c );
                nextChar();
                while( c != "" && ( c != "/" || bracketDepth > 0 ) )
                {
                    if( c == "\\" || c == "[" )
                    {
                        if ( c == "[" )
                        {
                            bracketDepth ++;
                        }
                        printf( "%s", c );
                        nextChar();
                    }
                    else if( c == "]" )
                    {
                        bracketDepth --;
                    }
                    printf( "%s", c );
                    nextChar();
                }
                if( c != "/" )
                {
                    printf( "**** Unterminated pattern\n" );
                }
                else
                {
                    nextChar();
                }
                printf( "/" );
            }
            else if( c ~ /[A-Za-z]/ )
            {
                # have a reserved word or identifier
                word = "";
                do
                {
                    word = word c;
                    nextChar();
                }
                while( c ~ /[A-Za-z0-9_]/ );
                if( word in reserved )
                {
                    word = "'''" word "'''";
                }
                printf( "%s", word );
            }
            else
            {
                # something else
                printf( "%s", c );
                nextChar();
            }
        }
        while( c != "" );
    }
    printf( "\n" );
}

# moves the first character of line into c, c is "" at the end of the line
function nextChar()
{
    if( c != " " )
    {
        # the last character wasn't a space, save it so we can recognise patterns
        lastC = c;
    }
    if( line == "" )
    {
        # at end of line
        c = "";
    }
    else
    {
        # not end of line
        c    = substr( line, 1, 1 );
        line = substr( line, 2 );
    }
} # nextChar
Julia
#= Keywords in Julia. Handles two word reserved keywords. #= Also
   #= handles nested comments such as this. =# =#
=#

const KEYWORDS = map(
    w -> Regex("^" * w * "\\W"),
    sort(
        [
            raw"abstract\s+type", "baremodule", "begin", "break", "catch", "const",
            "continue", "do", "else", "elseif", "end", "export", "false", "finally",
            "for", "function", "global", "if", "import", "in", "isa", "let", "local",
            "macro", "module", raw"mutable\s+struct", "outer", raw"primitive\s+type",
            "quote", "return", "struct", "true", "try", "using", "while", "where",
        ],
        rev = true,
        by = length,
    ),
) # reorder to largest first then convert to Regex

""" Find the #= =# delineated comment, including nested versions """
function nestedcommentlimits(s::AbstractString, startcomment = "#=", stopcomment = "=#")
    either = Regex("$startcomment|$stopcomment", "sa")
    depth, startpos, stoppos = 0, 0, 0
    for (i, m) in enumerate(eachmatch(either, s))
        if m.match == startcomment
            startpos = startpos == 0 ? m.match.offset : startpos
            depth += 1
        else
            stoppos = max(stoppos + 1, m.match.offset + 2)
            depth -= 1
        end
        # stop once the outermost comment has been closed
        depth <= 0 && break
    end
    return startpos, stoppos
end

"""
Given a string, output a string that has been modified by adding surrounding
\'\' or \'\'\' bracketing for syntax highlighting of keywords and comments
"""
function partialhighlight(txt)
    outtxt = Char[]
    idx, len = 1, length(txt)
    while idx <= len
        if !isvalid(txt, idx)
            idx += 1
            continue
        end
        c = txt[idx]
        if c == '\\'
            # backslash escape: copy the backslash and the escaped character
            push!(outtxt, c, txt[idx+1])
            idx += 2
        elseif c == '\"'
            # string literal, copied as is apart from indenting continuation lines
            if idx < len - 2 && c == txt[idx+1] == txt[idx+2]
                qlen = findfirst(r"(?<!\\)\"\"\""sa, txt[idx+3:end])
                qlen == nothing && error("error with terminator of quote at $idx")
                append!(outtxt, collect(replace(txt[idx:idx+qlen.stop+2], "\n" => "\n ")))
                idx += qlen.stop + 3
            else
                qlen = findfirst(r"(?<!\\)\"", txt[idx+1:end])
                qlen == nothing && error("error with terminator of quote at $idx")
                append!(outtxt, collect(replace(txt[idx:idx+qlen.stop+1], "\n" => "\n ")))
                outtxt[end] == '\n' && push!(outtxt, ' ')
                idx += qlen.stop + 2
            end
        elseif c == '#' && txt[max(1, idx - 1)] != '''
            # comment (a hash inside a character literal is not a comment)
            if idx < len && txt[idx+1] == '='
                # block comment, possibly nested: output in italics
                start, stop = nestedcommentlimits(txt[idx:end])
                s = replace(txt[idx:idx+stop-1], "\n" => "\n ")
                append!(outtxt, collect("''$s''"))
                idx += stop
            else
                # line comment: output in italics up to the end of the line
                newlinepos = something(findfirst(==('\n'), txt[idx+1:end]), len - idx)
                append!(outtxt, collect("''$(txt[idx:idx+newlinepos-1])''"))
                idx += newlinepos
            end
        elseif c ∈ 'a':'z'
            # possible keyword: try each pattern, longest first
            for (j, reg) in enumerate(KEYWORDS)
                m = match(reg, txt[idx:end])
                if m != nothing
                    # the match includes one trailing non-word character
                    wlen = m.match.ncodeunits - 2
                    append!(outtxt, collect("'''$(txt[idx:idx+wlen])'''"))
                    idx += wlen + 1
                    break
                elseif j == lastindex(KEYWORDS)
                    push!(outtxt, c)
                    idx += 1
                end
            end
        elseif c in [''', '&', '<', '>']
            # characters that are significant to the wiki are replaced by entities
            s = c == ''' ? "&apos;" : c == '&' ? "&amp;" : c == '<' ? "&lt;" : "&gt;"
            append!(outtxt, collect(s))
            idx += 1
        else
            push!(outtxt, c)
            idx += 1
        end
        # every output line starts with a space
        outtxt[end] == '\n' && push!(outtxt, ' ')
    end
    return String(outtxt)
end

println(partialhighlight(read(PROGRAM_FILE, String)), "\n")
Phix
Note the utility I use for this on a day-to-day basis (pwa/p2js.exw/<Ctrl M>) must be easily over 50,000 lines of code by now...
The following is deliberately the simplest possible thing that gets the job done, and there are of course 1,001 things missing:
No support for [multiline] shebangs, C-style comments, nested block comments, or (as noted) Eu-compatible block comments; and keywords could/should easily be several hundred entries long, and tested/constructed using A-Z and 0-9, ...
--
-- demo\rosetta\syntax_highlight.exw
-- =================================
--
--  Writes a wiki-formatted copy of the source named on the command line:
--  keywords in bold, comments in italics, each line preceded by a space.
--
string pgm = substitute(get_text(command_line()[$]),"\r\n","\n")
-- or(/for javascript compatibility) specify constant pgm = """..."""
constant qqq = `""`&`"`, /* (split to assist with permitting ^^^) */
         keywords = {`and`,`assert`,`bool`,`command_line`,`constant`,`do`,`else`,`elsif`,`end`,
                     `find`,`for`,`function`,`get_text`,`if`,`iff`,`in`,`integer`,`length`,`match`,`not`,
                     `procedure`,`puts`,`return`,`sequence`,`string`,`substitute`,`then`,`wait_key`,`while`},
         htmlify = {"'&<>",{`apos`,`amp`,`lt`,`gt`}}

integer i = 1, l = length(pgm), word_start = 0
string out = " "

-- appends s to out, replacing wiki-significant characters with entities
-- and following each newline with a space
procedure spacenl(sequence s)
    for ch in s do
        integer k = find(ch,htmlify[1])
        if k then ch = '&' & htmlify[2][k] & ';' end if
        out &= ch
        if ch='\n' then out &= ' ' end if
    end for
end procedure

-- outputs the string literal pgm[i..ni+l], ni being the start of the closing
-- quote (0 if there is none), returns the index of its last character
function do_string(integer i, ni, l, string stype)
    assert(ni>0,"%s quoted string not closed",{stype})
    ni += l
    spacenl(pgm[i..ni])
    return ni
end function

while i<=l do
    integer ch = pgm[i]
    if (ch>='a' and ch<='z') or ch='_' then
        if not word_start then word_start := i end if
    else
        if word_start then
            -- end of a word: output it, in bold if it is a keyword
            string one_word = pgm[word_start..i-1]
            bool is_key = find(one_word,keywords)
            if is_key then out &= `'''` end if
            out &= one_word
            if is_key then out &= `'''` end if
            word_start = 0
        end if
        if ch='-' and i<l and pgm[i+1]='-' then
            -- nb: does not handle --/* style comments
            integer line_comment = i
            while i<l and pgm[i+1]!='\n' do
                i += 1
            end while
            out &= `''`
            spacenl(pgm[line_comment..i])
            out &= `''`
        elsif ch='/' and i<l and pgm[i+1]='*' then
            -- nb: does not handle nested block comments
            integer block_comment = i
            i = match(`*/`,pgm,i+2)+1
            assert(i>1,"missing closing block comment")
            out &= `''`
            spacenl(pgm[block_comment..i])
            out &= `''`
        elsif ch='"' then
            if i+1<l and pgm[i..i+2]=qqq then
                i = do_string(i,match(qqq,pgm,i+3),2,"triple")
            else
                i = do_string(i,find('"',pgm,i+1),0,"double")
            end if
        elsif find(ch,"`'") then
            string stype = iff(ch='`'?"backtick":"single")
            i = do_string(i,find(ch,pgm,i+1),0,stype)
        else
            spacenl({ch})
        end if
    end if
    i += 1
end while
puts(1,out)
{} = wait_key()
Python
This solution builds on lexers available in Pygments by defining a formatter outputting simple MediaWiki markup, and a filter to translate characters to HTML escape sequences. Note that I've taken liberties with said escaping.
"""Syntax highlighting using Mediawiki formatting.""" from html import escape from textwrap import indent from io import StringIO from pygments import highlight from pygments.filter import Filter from pygments.formatter import Formatter from pygments.lexers import get_lexer_by_name from pygments.token import Token class MediaWikiFormatter(Formatter): """Format source code using MediaWiki markup.""" name = "MediaWiki" aliases = ["mediawiki", "wiki"] filenames = [] def __init__(self, **options): super().__init__(**options) self.indent = options.get("indent", " ") self.styles = { Token: ("", ""), Token.Comment: ("''", "''"), Token.Keyword: ("'''", "'''"), Token.String.Doc: ("''", "''"), } def format(self, token_source, outfile): buffer = StringIO() last_val = "" last_type = None for token_type, value in token_source: # Work up the token hierarchy until a style is found. while token_type not in self.styles: token_type = token_type.parent # Group consecutive tokens of the same type. if token_type == last_type: last_val += value else: if last_val: style_begin, style_end = self.styles[last_type] buffer.write(style_begin + last_val + style_end) last_val = value last_type = token_type # Flush remaining values. if last_val: style_begin, style_end = self.styles[last_type] buffer.write(style_begin + last_val + style_end) # Write indented lines to the output file. outfile.write( indent( buffer.getvalue(), self.indent, lambda _: True, ) ) class HTMLEscapeFilter(Filter): """Convert the characters &, <, > and ' to HTML-safe sequences.""" def __init__(self, **options): super().__init__(**options) def filter(self, _, stream): for ttype, value in stream: yield ttype, escape(value) def main(language_name="python", infile=None): formatter = MediaWikiFormatter() lexer = get_lexer_by_name(language_name) lexer.add_filter(HTMLEscapeFilter()) with open(infile or __file__) as fd: print(highlight(fd.read(), lexer, formatter), end="") if __name__ == "__main__": main()
Wren
Note that, rightly or wrongly, this code would not highlight keywords occurring in interpolated string expressions.
// Convert a Wren source to "wiki" format:
//     each line is preceded by a space
//     keywords are enclosed in ''' and ''' and comments in '' and ''
//     ', &, < and > are converted to &apos; &amp; &lt; and &gt;
//     everything else is output as is
// The source is read from a file and written to standard output.
// The file name should be passed as a command line argument.

import "./ioutil" for FileUtil
import "os" for Process

var keywords = [
    "as", "break", "class", "construct", "continue", "else", "false",
    "for", "foreign", "if", "in", "is", "import", "null", "return",
    "static", "super", "this", "true", "var", "while"
]

var alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_."

var highlight = Fn.new { |lines|
    var inStr = false // within a string literal
    var inRaw = false // within a raw string literal
    var inCom = false // within a multi-line comment
    for (line in lines) {
        System.write(" ")
        // ampersands first, so those introduced by the other entities are kept
        line = line.replace("&", "&amp;").replace("'", "&apos;").replace("<", "&lt;").replace(">", "&gt;")
        var word = ""
        var chrs = line.toList // convert to list of unicode characters
        var i = 0
        if (inCom) System.write("''")
        while (i < chrs.count) {
            var c = chrs[i]
            if (inCom) { // if inside a multi-line comment
                if (c == "*" && i < chrs.count-1 && chrs[i+1] == "/") {
                    inCom = false
                    System.write("*/''")
                    i = i + 1
                } else {
                    System.write(c)
                }
            } else if (inStr && c == "\\" && i < chrs.count-1) {
                /* escape sequence in string literal: copy both characters so that
                   an escaped quote or escaped backslash cannot end the string */
                System.write(c + chrs[i+1])
                i = i + 1
            } else if (c == "\"") { // any other double quote
                // write any word that immediately precedes the quote
                if (keywords.contains(word)) {
                    System.write("'''" + word + "'''")
                } else {
                    System.write(word)
                }
                word = ""
                if (i > 1 && chrs[i-2] == "\"" && chrs[i-1] == "\"") {
                    inRaw = !inRaw
                } else if (!inRaw) {
                    inStr = !inStr
                }
                System.write("\"")
            } else if (inStr || inRaw) { // otherwise if within a string just write it
                System.write(c)
            } else if (c == "/") { // forward slash
                // write any word that immediately precedes the slash
                if (keywords.contains(word)) {
                    System.write("'''" + word + "'''")
                } else {
                    System.write(word)
                }
                word = ""
                if (i < chrs.count-1 && chrs[i+1] == c) {
                    // line comment: the rest of the line is italic
                    System.write("''" + chrs[i..-1].join() + "''")
                    break
                } else if (i < chrs.count-1 && chrs[i+1] == "*") {
                    inCom = true
                    System.write("''" + "/*")
                    i = i + 1
                } else {
                    System.write(c)
                }
            } else if (alphabet.contains(c)) { // if eligible, add to current word
                word = word + c
            } else if (keywords.contains(word)) { // if it's a keyword, embolden it
                System.write("'''" + word + "'''" + c)
                word = ""
            } else { // otherwise just write the word
                System.write(word + c)
                word = ""
            }
            i = i + 1
        }
        if (inCom) {
            // close the italics at the end of each line of a multi-line comment
            System.write("''")
        } else if (word != "") {
            if (keywords.contains(word)) {
                System.write("'''" + word + "'''")
            } else {
                System.write(word)
            }
        }
        System.print()
    }
}

var args = Process.arguments
if (args.count != 1) {
    /* make sure double quotes and keywords in raw strings are handled properly */
    Fiber.abort("""Please pass the file name to be highlighted "as" the only argument.""")
}
var lines = FileUtil.readLines(args[0])
highlight.call(lines)
/* this code should now be saved to syntax_highlighting.wren */