Syntax highlighting using Mediawiki formatting: Difference between revisions
m →{{header|Phix}}: iff() |
m →{{header|Phix}}: inlined the command_line() call |
||
Line 379: | Line 379: | ||
''-- ================================='' |
''-- ================================='' |
||
''--'' |
''--'' |
||
''' |
'''string''' pgm = '''substitute'''('''get_text'''('''command_line'''()[$]),"\r\n","\n") |
||
'''string''' pgm = '''substitute'''('''get_text'''(cl[$]),"\r\n","\n") |
|||
''-- or(/for javascript compatibility) specify constant pgm = """...""" '' |
''-- or(/for javascript compatibility) specify constant pgm = """...""" '' |
||
'''constant''' qqq = `""`&`"`, ''/* (split to assist with permitting ^^^) */'' |
'''constant''' qqq = `""`&`"`, ''/* (split to assist with permitting ^^^) */'' |
Revision as of 01:40, 8 October 2023
Introduction
When formatting a page for display, Mediawiki allows the page to include bold and italic text by placing the bold/italic text within paired repeated-single quote characters - 3 single quotes for bold and 2 for italic, 5 for bold italic.
E.g.:
'''bold-word''' and ''italic-word'' appears as bold-word and italic-word.
This could be used to provide simple syntax-highlighting without the use of the relatively more expensive <syntaxhighlight> tags or for languages not currently supported by Pygments.
A few languages on Rosetta Code are currently using schemes like this.
- Task
The task is to write a syntax highlighter that given a source in your language will output a wiki formatted version of the source with the keywords/reserved words in bold and the comments in italics.
Note that each source line (including blank lines) should be output with a leading space, to ensure the source is treated as a single block.
Additionally, translate the following characters:
- single-quote (') to &apos;
- ampersand (&) to &amp;
- less-than (<) to &lt;
- greater-than (>) to &gt;
If your language doesn't have keywords/reserved words or comments, use your judgement on what to highlight in bold or italic : )
Presenting your source
Instead of showing your source within syntaxhighlight tags and having a separate output block, just show the source that would be output from your program when given its own source to process.
I.e., don't use syntaxhighlight tags.
See also
https://www.mediawiki.org/wiki/Help:Formatting
ALGOL 68
Handles upper-stropping Algol 68 sources (as used by ALGOL 68G and most other compilers).
# Convert an upper-stropped Algol 68 source to "wiki" format                 #
#     each line is preceded by a space,                                      #
#     bold words are enclosed in ''' and ''' and comments in '' and ''       #
#     ', &, < and > are converted to &apos; &amp; &lt; and &gt;              #
#     everything else is output as is                                        #
# the source is read from stand in and written to stand out                  #
# the last line in the file must end with a newline                          #
# { and } are assumed to be alternatives for ( and ), if { } should be       #
# treated as comments ( as in ALGOL68RS/algol68toc )                         #
#     change rs style brief comments to TRUE                                 #
BEGIN

    # TRUE if {} delimits a nestable brief comment, as in ALGOL 68RS and     #
    #      algol68toc, FALSE if {} are alternatives to () as in ALGOL 68G    #
    BOOL rs style brief comments = FALSE;

    BOOL   at eof := FALSE;  # TRUE if EOF has been reached, FALSE otherwise #
    on logical file end( stand in             # set EOF handler for stand in #
                       , ( REF FILE f )BOOL:
                             # note that we reached EOF on the latest read   #
                             # and return TRUE so processing can continue    #
                             at eof := TRUE
                       );
    CHAR   nl           = REPR 10;                       # newline character #
    INT    error count := 0;                     # number of errors reported #
    STRING line        := nl;                          # current source line #
    INT    pos         := LWB line;               # current position in line #
    CHAR   c           := " ";                    # current source character #

    PROC error = ( STRING message )VOID:                  # reports an error #
         BEGIN
            error count +:= 1;
            print( ( newline, newline, "**** ", message, newline ) )
         END # error # ;

    # reports an unterminated construct ( e.g. string, comment )             #
    PROC unterminated = ( STRING construct )VOID:
         error( "Unterminated " + construct );

    PROC next char = VOID:  # gets the next source character, stores it in c #
         IF pos <= UPB line
         THEN
             c := line[ pos ];         # not past the end of the source line #
             pos +:= 1
         ELIF # past the end of the current source line - get the next       #
              at eof := FALSE;
              read( ( line, newline ) );
              NOT at eof
         THEN
             line +:= nl;                                # have another line #
             pos   := LWB line;
             c     := line[ pos ];
             pos  +:= 1
         ELSE
             line := "";                                       # reached eof #
             c    := REPR 0
         FI # next char # ;

    PROC out char = ( CHAR ch )VOID:               # converts and outputs ch #
         IF   ch = nl  THEN print( ( newline, " " ) ) # lines start with " " #
         ELIF ch = "<" THEN print( ( "&lt;"   ) )
         ELIF ch = ">" THEN print( ( "&gt;"   ) )
         ELIF ch = "&" THEN print( ( "&amp;"  ) )
         ELIF ch = "'" THEN print( ( "&apos;" ) )
         ELSE               print( ch )
         FI # out char # ;

    # outputs a wiki start/end italic delimiter                              #
    PROC italic delimiter = VOID: print( ( "''" ) );
    # outputs a wiki start/end bold delimiter                                #
    PROC bold delimiter = VOID: print( ( "'''" ) );
    # returns TRUE if the current character is a string delimiter            #
    PROC have string delimiter = BOOL: c = """";
    # returns TRUE if the current character can start a bold word            #
    PROC have bold = BOOL: c >= "A" AND c <= "Z";

    # outputs a brief comment to stand out                                   #
    #     end char    is the closing delimiter,                              #
    #     nested char is the opening delimiter for nestable brief comments   #
    #                 if nested char is blank, the brief comment does not    #
    #                 nest                                                   #
    # this handles ALGOL 68RS and algol68toc style {} comments               #
    # on exit, c is the first character after the closing delimiter          #
    PROC copy brief comment = ( CHAR end char, CHAR nested char )VOID:
         BEGIN
            out char( c );                           # the opening delimiter #
            next char;
            WHILE NOT at eof AND c /= end char
            DO
                IF c = nested char AND nested char /= " "
                THEN # nested brief comment - the recursive call has already #
                     # advanced c past the nested comment, so we must not    #
                     # advance again here or a character would be dropped    #
                    copy brief comment( end char, nested char )
                ELSE # normal comment char #
                    out char( c );
                    next char
                FI
            OD;
            IF at eof
            THEN # unterminated comment #
                unterminated( """" + end char + """ comment" );
                c := end char
            FI;
            out char( c );
            next char
         END # copy brief comment # ;

    PROC copy string = VOID:   # outputs a string denotation from the source #
         WHILE have string delimiter
         DO                                    # within a string denotation, #
             WHILE out char( c );               # "" denotes the " character #
                   next char;
                   NOT at eof AND NOT have string delimiter
             DO SKIP OD;
             IF NOT have string delimiter
             THEN
                 unterminated( "string" );
                 c := """"
             FI;
             out char( c );
             next char
         OD # copy string # ;

    PROC get bold word = STRING:        # gets a bold word from the source   #
         BEGIN
            STRING result := "";
            WHILE have bold OR c = "_"
            DO
                result +:= c;
                next char
            OD;
            result
         END # get bold word # ;

    PROC copy to bold = STRING:           # copies the source to the output  #
         IF at eof                        # until a bold word is encountered #
         THEN ""
         ELSE
             STRING result := "";
             WHILE out char( c );
                   next char;
                   NOT at eof AND NOT have bold
             DO SKIP OD;
             IF NOT at eof THEN result := get bold word FI;
             result
         FI # copy to bold # ;

    PROC bold word or comment = VOID:               # handles a bold COMMENT #
         IF STRING bold word := get bold word;          # or other bold word #
            bold word = "CO" OR bold word = "COMMENT"
         THEN
             italic delimiter;                         # have a bold comment #
             STRING delimiter = bold word;
             WHILE print( ( bold word ) );
                   bold word := copy to bold;
                   NOT at eof AND bold word /= delimiter
             DO SKIP OD;
             IF at eof
             THEN
                 unterminated( """" + delimiter + """ comment" )
             FI;
             print( ( delimiter ) );
             italic delimiter
         ELSE                                         # some other bold word #
             bold delimiter;
             print( ( bold word ) );
             bold delimiter
         FI # bold word or comment # ;

    # copy the source to stand out, converting to wiki format                #
    next char;
    WHILE NOT at eof
    DO
        IF c = "#"
        THEN # brief comment #
            italic delimiter;
            copy brief comment( "#", " " );
            italic delimiter
        ELIF c = "{" AND rs style brief comments
        THEN # nestable brief comment ( ALGOL 68RS and algol68toc ) #
            italic delimiter;
            copy brief comment( "}", "{" );
            italic delimiter
        ELIF have string delimiter
        THEN # STRING or CHAR denotation #
            copy string
        ELIF have bold
        THEN # have a bold word #
            bold word or comment
        ELSE # anything else #
            out char( c );
            next char
        FI
    OD;
    IF error count > 0
    THEN # had errors processing the source #
        print( ( "**** ", whole( error count, 0 ), " errors", newline ) )
    FI

END
AWK
Parsing of patterns may not be correct in all cases.
# convert an AWK source to wiki format
#     each line is preceded by a space,
#     reserved words are enclosed in ''' and ''' and comments in '' and ''
#     ', &, < and > are converted to &apos; &amp; &lt; and &gt;
#     everything else is output as is
# the wiki source is written to stdout

BEGIN \
{
    # reserved word list as in gawk and treating getline as reserved
    kw = "BEGIN/BEGINFILE/END/ENDFILE/"                       \
         "break/case/continue/default/delete/do/while/else/"  \
         "exit/for/in/function/func/if/next/nextfile/switch/" \
         "getline";
    n = split( kw, reservedWords, "/" );
    for( w = 1; w <= n; w ++ ) {
        reserved[ reservedWords[ w ] ] = w;
    }
} # BEGIN

{
    printf( " " );
    line = $0;
    # escape the special characters; "\\&" yields a literal "&" in the
    # replacement, and "&" must be handled first so that the ampersands
    # introduced by the later substitutions are not escaped again
    gsub( /&/, "\\&amp;",  line );
    gsub( /</, "\\&lt;",   line );
    gsub( />/, "\\&gt;",   line );
    gsub( /'/, "\\&apos;", line );
    if( line != "" ) {
        c = "";
        nextChar();
        do {
            if( c == "#" ) {
                # comment - the rest of the line is output in italics
                printf( "''#%s''", line );
                c = "";
            } else if( c == "\"" ) {
                # string literal
                do {
                    if( c == "\\" ) {
                        # escaped character - copy the backslash so the
                        # following character cannot end the string
                        printf( "%s", c );
                        nextChar();
                    }
                    printf( "%s", c );
                    nextChar();
                } while( c != "\"" && c != "" );
                if( c != "\"" ) {
                    printf( "**** Unterminated string\n" );
                } else {
                    nextChar();
                }
                printf( "\"" );
            } else if( c == "/" && lastC !~ /[A-Za-z0-9_.]/ ) {
                # pattern - a "/" following an operand is a division instead
                bracketDepth = 0;
                printf( "%s", c );
                nextChar();
                while( c != "" && ( c != "/" || bracketDepth > 0 ) ) {
                    if( c == "\\" || c == "[" ) {
                        if( c == "[" ) {
                            bracketDepth ++;
                        }
                        printf( "%s", c );
                        nextChar();
                    } else if( c == "]" ) {
                        bracketDepth --;
                    }
                    printf( "%s", c );
                    nextChar();
                }
                if( c != "/" ) {
                    printf( "**** Unterminated pattern\n" );
                } else {
                    nextChar();
                }
                printf( "/" );
            } else if( c ~ /[A-Za-z]/ ) {
                # have a reserved word or identifier
                word = "";
                do {
                    word = word c;
                    nextChar();
                } while( c ~ /[A-Za-z0-9_]/ );
                if( word in reserved ) {
                    word = "'''" word "'''";
                }
                printf( "%s", word );
            } else {
                # something else
                printf( "%s", c );
                nextChar();
            }
        } while( c != "" );
    }
    printf( "\n" );
}

# moves the first character of line into c ( "" at end of line ) and
# remembers the previous non-space character in lastC
function nextChar()
{
    if( c != " " ) {
        # the last character wasn't a space, save it so we can recognise patterns
        lastC = c;
    }
    if( line == "" ) {
        # at end of line
        c = "";
    } else {
        # not end of line
        c    = substr( line, 1, 1 );
        line = substr( line, 2 );
    }
} # nextChar
Phix
Note the utility I use for this on a day-to-day basis (pwa/p2js.exw/<Ctrl M>) must be easily over 50,000 lines of code by now...
The following is deliberately the simplest possible thing that gets the job done, and there are of course 1,001 things missing:
No support for [multiline] shebangs, C-style comments, nested block comments, or (as noted) Eu-compatible block comments; and keywords c/should easily be several hundred entries long, and tested/constructed using A-Z and 0-9, ...
--
-- demo\rosetta\syntax_highlight.exw
-- =================================
--
--  Reads the source file named by the last command line argument and prints
--  it as MediaWiki markup: keywords in bold, comments in italics, every line
--  prefixed with a space, and ' & < > replaced by their html entities.
--
string pgm = substitute(get_text(command_line()[$]),"\r\n","\n")
-- or(/for javascript compatibility) specify constant pgm = """..."""
constant qqq = `""`&`"`, /* (split to assist with permitting ^^^) */
         keywords = {`and`,`assert`,`bool`,`command_line`,`constant`,`do`,`else`,`elsif`,`end`,
                     `find`,`for`,`function`,`get_text`,`if`,`iff`,`in`,`integer`,`length`,`match`,`not`,
                     `procedure`,`puts`,`return`,`sequence`,`string`,`substitute`,`then`,`wait_key`,`while`},
         htmlify = {"'&<>",{`apos`,`amp`,`lt`,`gt`}}
integer i = 1, l = length(pgm), word_start = 0
string out = " "

procedure spacenl(sequence s)
    -- append s to out, translating the htmlify characters to entities
    -- and following every newline with the mandatory leading space
    for ch in s do
        integer k = find(ch,htmlify[1])
        if k then ch = '&' & htmlify[2][k] & ';' end if
        out &= ch
        if ch='\n' then out &= ' ' end if
    end for
end procedure

function do_string(integer i, ni, l, string stype)
    -- output the string literal pgm[i..ni+l], where ni is the position of the
    -- closing quote (0 if none was found) and l the extra length of the closing
    -- delimiter; returns the index of the last character of the literal
    assert(ni>0,"%s quoted string not closed",{stype})
    ni += l
    spacenl(pgm[i..ni])
    return ni
end function

while i<=l do
    integer ch = pgm[i]
    if (ch>='a' and ch<='z') or ch='_' then
        if not word_start then word_start := i end if
    else
        if word_start then
            -- a word has just ended: emit it, in bold if it is a keyword
            string one_word = pgm[word_start..i-1]
            bool is_key = find(one_word,keywords)
            if is_key then out &= `'''` end if
            out &= one_word
            if is_key then out &= `'''` end if
            word_start = 0
        end if
        if ch='-' and i<l and pgm[i+1]='-' then
            -- nb: does not handle --/* style comments
            integer line_comment = i
            while i<l and pgm[i+1]!='\n' do i += 1 end while
            out &= `''`
            spacenl(pgm[line_comment..i])
            out &= `''`
        elsif ch='/' and i<l and pgm[i+1]='*' then
            -- nb: does not handle nested block comments
            integer block_comment = i
            i = match(`*/`,pgm,i+2)+1
            assert(i>1,"missing closing block comment")
            out &= `''`
            spacenl(pgm[block_comment..i])
            out &= `''`
        elsif ch='"' then
            if i+1<l and pgm[i..i+2]=qqq then
                i = do_string(i,match(qqq,pgm,i+3),2,"triple")
            else
                i = do_string(i,find('"',pgm,i+1),0,"double")
            end if
        elsif find(ch,"`'") then
            string stype = iff(ch='`'?"backtick":"single")
            i = do_string(i,find(ch,pgm,i+1),0,stype)
        else
            spacenl({ch})
        end if
    end if
    i += 1
end while
puts(1,out)
{} = wait_key()
Python
This solution builds on lexers available in Pygments by defining a formatter outputting simple MediaWiki markup, and a filter to translate characters to HTML escape sequences. Note that I've taken liberties with said escaping.
"""Syntax highlighting using Mediawiki formatting.""" from html import escape from textwrap import indent from pygments import highlight from pygments.filter import Filter from pygments.formatter import Formatter from pygments.lexers import get_lexer_by_name from pygments.token import Token class MediaWikiFormatter(Formatter): """Format source code using MediaWiki markup.""" def __init__(self, **options): super().__init__(**options) self.styles = { Token: ("", ""), Token.Comment: ("''", "''"), Token.Keyword: ("'''", "'''"), Token.String.Doc: ("''", "''"), } def format(self, token_source, outfile): last_val = "" last_type = None for token_type, value in token_source: # Work up the token hierarchy until a style is found. while token_type not in self.styles: token_type = token_type.parent # Group consecutive tokens of the same type. if token_type == last_type: last_val += value else: if last_val: style_begin, style_end = self.styles[last_type] outfile.write(style_begin + last_val + style_end) last_val = value last_type = token_type # Flush remaining values. if last_val: style_begin, style_end = self.styles[last_type] outfile.write(style_begin + last_val + style_end) class HTMLEscapeFilter(Filter): """Convert the characters &, <, > and ' to HTML-safe sequences.""" def __init__(self, **options): super().__init__(**options) def filter(self, _, stream): for ttype, value in stream: yield ttype, escape(value) def main(language_name="python", infile=None): formatter = MediaWikiFormatter(style="bw") lexer = get_lexer_by_name(language_name) lexer.add_filter(HTMLEscapeFilter()) with open(infile or __file__) as fd: print( indent( highlight(fd.read(), lexer, formatter), " ", lambda line: True, ), end="", ) if __name__ == "__main__": main()