Syntax highlighting using Mediawiki formatting: Difference between revisions

Content deleted Content added

Inline

Revision as of 01:40, 8 October 2023

Introduction

When formatting a page for display, Mediawiki allows the page to include bold and italic text by placing the bold/italic text within paired repeated-single quote characters - 3 single quotes for bold and 2 for italic, 5 for bold italic.
E.g.: '''bold-word''' and ''italic-word'' appears as bold-word and italic-word.

This could be used to provide simple syntax-highlighting without the use of the relatively more expensive <syntaxhighlight> tags or for languages not currently supported by Pygments. A few languages on Rosetta Code are currently using schemes like this.

Task

The task is to write a syntax highlighter that given a source in your language will output a wiki formatted version of the source with the keywords/reserved words in bold and the comments in italics.
Note that each source line (including blank lines) should be output with a leading space, to ensure the source is treated as a single block.

Additionally, translate the following characters:

single-quote (') to &apos;
ampersand (&) to &amp;
less-than (<) to &lt;
greater-than (>) to &gt;

If your language doesn't have keywords/reserved words or comments, use your judgement on what to highlight in bold or italic : )

Presenting your source

Instead of showing your source within syntaxhighlight tags and having a separate output block, just show the source that would be output from your program when given its own source to process.
I.e., don't use syntaxhighlight tags.

ALGOL 68

Handles upper-stropping Algol 68 sources (as used by ALGOL 68G and most other compilers).

# Convert an upper-stropped Algol 68 source to "wiki" format                  #
#    each line is preceded by a space,                                        #
#    bold words are enclosed in ''' and ''' and comments in '' and ''         #
#    ', &, < and > are converted to &apos; &amp; &lt; and &gt;                #
#    everything else is output as is                                          #
# the source is read from stand in and written to stand out                   #
# the last line in the file must end with a newline                           #
# { and } are assumed to be alternatives for ( and ), if { } should be        #
#     treated as comments ( as in ALGOL68RS/algol68toc )                      #
#         change rs style brief comments to TRUE                              #
BEGIN

    # TRUE if {} delimits a nestable brief comment, as in ALGOL 68RS and      #
    #      algol68toc, FALSE if {} are alternatives to () as in ALGOL 68G     #
    BOOL rs style brief comments = FALSE;

    BOOL at eof := FALSE;     # TRUE if EOF has been reached, FALSE otherwise #
    on logical file end( stand in              # set EOF handler for stand in #
                       , ( REF FILE f )BOOL:
                             # note that we reached EOF on the latest read    #
                             # and return TRUE so processing can continue     #
                             at eof := TRUE
                       );
    CHAR   nl        = REPR 10;                           # newline character #
    INT error count := 0;                         # number of errors reported #
    # line starts as a single newline so that the first character processed   #
    # is a newline, which out char follows with the leading space of the      #
    # first output line                                                       #
    STRING line     := nl;                              # current source line #
    # pos indexes the next character that next char will take from line       #
    INT    pos      := LWB line;                   # current position in line #
    CHAR   c        := " ";                        # current source character #

    # counts an error and writes its message, after a blank line, to stand    #
    #     out, prefixed with four asterisks                                   #
    PROC error = ( STRING message )VOID:
         ( error count +:= 1;
           print( ( newline, newline, "**** " + message, newline ) )
         ) # error # ;
    # reports an unterminated construct, e.g. a string or a comment           #
    PROC unterminated = ( STRING construct )VOID:
         error( "Unterminated " + construct );
    # advances to the next source character, reading another line from stand  #
    #     in when the current one is used up, at end of file at eof is TRUE   #
    #     and c is set to REPR 0                                              #
    PROC next char = VOID:   # gets the next source character, stores it in c #
         IF pos <= UPB line THEN
             c := line[ pos ];          # not past the end of the source line #
             pos +:= 1
         ELIF        # past the end of the current source line - get the next #
             # clear the flag, the EOF handler sets it again if read hits eof #
             at eof := FALSE;
             read( ( line, newline ) );
             NOT at eof
         THEN
             # every line is given a trailing newline before it is scanned    #
             line +:= nl;                                 # have another line #
             pos   := LWB line;
             c     := line[ pos ];
             pos  +:= 1
         ELSE
             # an empty line keeps pos beyond UPB line on any later call      #
             line := "";                                        # reached eof #
             c    := REPR 0
         FI # next char # ;
    # writes ch to stand out, translating the characters that matter to the   #
    #     wiki: a newline is followed by a space so the next line is indented #
    #     and ', &, < and > are replaced by character entities                #
    PROC out char = ( CHAR ch )VOID:
         IF   ch = "'" THEN print( "&apos;" )
         ELIF ch = "&" THEN print( "&amp;" )
         ELIF ch = "<" THEN print( "&lt;" )
         ELIF ch = ">" THEN print( "&gt;" )
         ELIF ch = nl  THEN print( ( newline, " " ) )
         ELSE               print( ch )
         FI # out char # ;
    # writes the two quotes that open or close wiki italic text               #
    PROC italic delimiter = VOID: print( "''" );
    # writes the three quotes that open or close wiki bold text               #
    PROC bold delimiter = VOID: print( "'''" );
    # TRUE if the current character is the quote that delimits a string       #
    PROC have string delimiter = BOOL: """" = c;
    # TRUE if the current character is an upper case letter, i.e. it can      #
    #     start a bold word                                                   #
    PROC have bold = BOOL: "A" <= c AND c <= "Z";
    # outputs a brief comment to stand out                                    #
    #    end char is the closing delimiter,                                   #
    #    nested char is the opening delimiter for nestable brief comments     #
    #        if nested char is blank, the brief comment does not nest         #
    #    this handles ALGOL 68RS and algol68toc style {} comments             #
    #    on entry c is the opening delimiter and on exit c is the             #
    #        character following the closing delimiter                        #
    PROC copy brief comment = ( CHAR end char, CHAR nested char )VOID:
         BEGIN
            out char( c );
            next char;
            WHILE NOT at eof AND c /= end char
            DO
                IF c = nested char AND nested char /= " " THEN
                    # nested brief comment - the recursive call leaves c      #
                    # at the character after the nested comment so we         #
                    # must not call next char again here                      #
                    copy brief comment( end char, nested char )
                ELSE
                    # normal comment char                                     #
                    out char( c );
                    next char
                FI
            OD;
            IF at eof THEN
                # unterminated comment - report it and supply the delimiter   #
                unterminated( """" + end char + """ comment" );
                c := end char
            FI;
            out char( c );
            next char
         END # copy brief comment # ;
    # on entry c is the opening quote, on exit c is the first character after #
    #     the denotation, an embedded doubled quote is handled by the outer   #
    #     loop, which starts again when a closing quote is followed at once   #
    #     by another quote                                                    #
    PROC copy string = VOID:    # outputs a string denotation from the source #
         WHILE have string delimiter DO         # within a string denotation, #
            WHILE out char( c );                 # "" denotes the " character #
                  next char;
                  NOT at eof AND NOT have string delimiter
            DO SKIP OD;
            IF NOT have string delimiter THEN
                # stopped at eof, not at a quote - report it and supply the   #
                # missing closing quote                                       #
                unterminated( "string" );
                c := """"
            FI;
            out char( c );
            next char
         OD # copy string # ;
    # collects the upper case letters and underscores starting at the current #
    #     character and returns them as the bold word, c is left at the first #
    #     character that is not part of the word                              #
    PROC get bold word = STRING:
         BEGIN
            STRING word := "";
            WHILE c = "_" OR have bold DO
                word +:= c;
                next char
            OD;
            word
         END # get bold word # ;
    # returns the bold word that stopped the copy, or "" if eof was reached   #
    #     first, the current character is always output before the scan for   #
    #     an upper case letter starts                                         #
    PROC copy to bold = STRING:             # copies the source to the output #
         IF at eof                         # until a bold word is encountered #
         THEN ""
         ELSE STRING result := "";
              WHILE out char( c );
                    next char;
                    NOT at eof
                AND NOT have bold
              DO SKIP OD;
              IF NOT at eof THEN result := get bold word FI;
              result
         FI # copy to bold # ;
    # reads the bold word at the current position, CO and COMMENT start a     #
    #     comment that is output in italics up to and including the next      #
    #     occurrence of the same bold word, any other bold word is output in  #
    #     bold                                                                #
    PROC bold word or comment = VOID:                # handles a bold COMMENT #
         IF STRING bold word := get bold word;           # or other bold word #
            bold word = "CO" OR bold word = "COMMENT"
         THEN
            italic delimiter;                           # have a bold comment #
            STRING delimiter = bold word;
            # each pass prints the bold word just found and then copies the   #
            # text up to the next bold word                                   #
            WHILE print( ( bold word ) );
                  bold word := copy to bold;
                  NOT at eof
              AND bold word /= delimiter
            DO SKIP OD;
            IF at eof THEN
                unterminated( """" + delimiter + """ comment" )
            FI;
            # the closing delimiter is printed even if it had to be assumed   #
            print( ( delimiter ) );
            italic delimiter
         ELSE                                          # some other bold word #
            bold delimiter;
            print( ( bold word ) );
            bold delimiter
         FI # bold word or comment # ;

    # copy the source to stand out, converting to wiki format                 #
    # the first next char delivers the initial newline held in line           #
    next char;
    WHILE NOT at eof DO
        IF   c = "#" THEN                                     # brief comment #
            italic delimiter;
            copy brief comment( "#", " " );
            italic delimiter
        ELIF c = "{" AND rs style brief comments THEN
            # nestable brief comment ( ALGOL 68RS and algol68toc )            #
            italic delimiter;
            copy brief comment( "}", "{" );
            italic delimiter
        ELIF have string delimiter THEN           # STRING or CHAR denotation #
            copy string
        ELIF have bold THEN                                # have a bold word #
            bold word or comment
        ELSE
            # anything else                                                   #
            out char( c );
            next char
        FI
    OD;

    IF error count > 0 THEN
        # had errors processing the source                                    #
        print( ( "**** ", whole( error count, 0 ), " errors", newline ) )
    FI

END

AWK

Parsing of patterns may not be correct in all cases.

# convert an AWK source to wiki format
#    each line is preceded by a space,
#    reserved words are enclosed in ''' and ''' and comments in '' and ''
#    ', &, < and > are converted to &apos; &amp; &lt; and &gt;
#    everything else is output as is
# the wiki source is written to stdout

BEGIN \
{

    # build the reserved word set: the gawk reserved words plus getline,
    # which is treated as reserved here; only membership of "reserved" is
    # tested later, the stored position is incidental
    kw = "BEGIN/BEGINFILE/END/ENDFILE";
    kw = kw "/break/case/continue/default/delete/do/while/else";
    kw = kw "/exit/for/in/function/func/if/next/nextfile/switch";
    kw = kw "/getline";
    n  = split( kw, reservedWords, "/" );
    w  = 1;
    while( w <= n )
    {
        reserved[ reservedWords[ w ] ] = w;
        w ++;
    }

} # BEGIN

{

    # Converts one source line to wiki format and writes it to stdout.
    # Every output line starts with a space so the wiki treats the whole
    # source as a single block.
    printf( " " );
    line = $0;
    # translate the characters that are special to the wiki; & is done first
    # so the & of the other entities is not translated again, and "\\&" in
    # the replacement stands for a literal &
    gsub( /&/, "\\&amp;",  line );
    gsub( /</, "\\&lt;",   line );
    gsub( />/, "\\&gt;",   line );
    gsub( /'/, "\\&apos;", line );

    if( line != "" )
    {
        # c is the current character, nextChar() takes it from the front of
        # line; clearing c first also makes nextChar() reset lastC
        c = "";
        nextChar();
        do
        {
            if     ( c == "#" )
            {
                # comment - it runs to the end of the line
                printf( "''#%s''", line );
                c = "";
            }
            else if( c == "\"" )
            {
                # string literal
                do
                {
                    if( c == "\\" )
                    {
                        # escape - also copy the character it protects
                        printf( "%s", c );
                        nextChar();
                    }
                    printf( "%s", c );
                    nextChar();
                }
                while( c != "\"" && c != "" );
                if( c != "\"" )
                {
                    printf( "**** Unterminated string\n" );
                }
                else
                {
                    nextChar();
                }
                printf( "\"" );
            }
            else if( c == "/" && lastC !~ /[])A-Za-z0-9_.]/ )
            {
                # pattern - a / that follows an operand ( a name, a number,
                # a closing bracket ) is a division and is not handled here
                bracketDepth = 0;
                printf( "%s", c );
                nextChar();
                while( c != "" && ( c != "/" || bracketDepth > 0 ) )
                {
                    if( c == "\\" || c == "[" )
                    {
                        # the character after a \ or a [ is copied without
                        # being examined
                        if     ( c == "[" )
                        {
                            bracketDepth ++;
                        }
                        printf( "%s", c );
                        nextChar();
                    }
                    else if( c == "]" )
                    {
                        bracketDepth --;
                    }
                    printf( "%s", c );
                    nextChar();
                }
                if( c != "/" )
                {
                    printf( "**** Unterminated pattern\n" );
                }
                else
                {
                    nextChar();
                }
                printf( "/" );
            }
            else if( c ~ /[A-Za-z_]/ )
            {
                # have a reserved word or identifier - a leading underscore
                # belongs to the word, so e.g. _if is not shown as if
                word = "";
                do
                {
                    word = word c;
                    nextChar();
                }
                while( c ~ /[A-Za-z0-9_]/ );
                if( word in reserved )
                {
                    word = "'''" word "'''";
                }
                printf( "%s", word );
            }
            else
            {
                # something else
                printf( "%s", c );
                nextChar();
            }
        }
        while( c != "" );
    }
    printf( "\n" );

}

function nextChar()
{
    # Moves the first character of the global "line" into the global "c".
    # At the end of the line c becomes "" ( substr of an empty string is
    # empty, so no separate end of line test is needed ).
    # lastC remembers the previous character unless it was a space, so the
    # caller can tell a pattern from a division.
    if( c != " " )
    {
        lastC = c;
    }
    c    = substr( line, 1, 1 );
    line = substr( line, 2 );

} # nextChar

Phix

Note the utility I use for this on a day-to-day basis (pwa/p2js.exw/<Ctrl M>) must be easily over 50,000 lines of code by now...
The following is deliberately the simplest possible thing that gets the job done, and there are of course 1,001 things missing: No support for [multiline] shebangs, C-style comments, nested block comments, or (as noted) Eu-compatible block comments; and keywords c/should easily be several hundred entries long, and tested/constructed using A-Z and 0-9, ...

--
-- demo\rosetta\syntax_highlight.exw
-- =================================
--
-- the source to highlight: the text obtained from the last command line
-- argument, with "\r\n" line ends replaced by "\n"
string pgm = substitute(get_text(command_line()[$]),"\r\n","\n")
-- or(/for javascript compatibility) specify constant pgm = """...""" 
-- qqq is the three double quote delimiter, keywords are the words shown in
-- bold, htmlify pairs each character to escape with its entity name
constant qqq = `""`&`"`, /* (split to assist with permitting ^^^) */
         keywords = {`and`,`assert`,`bool`,`command_line`,`constant`,`do`,`else`,`elsif`,`end`,
        `find`,`for`,`function`,`get_text`,`if`,`iff`,`in`,`integer`,`length`,`match`,`not`,
        `procedure`,`puts`,`return`,`sequence`,`string`,`substitute`,`then`,`wait_key`,`while`},
         htmlify = {"'&<>",{`apos`,`amp`,`lt`,`gt`}}
-- i is the scan position in pgm, l its length, word_start the index where
-- the word being scanned began (0 when not in a word)
integer i = 1, l = length(pgm), word_start = 0
-- the output starts with the space that indents the first line
string out = " "

procedure spacenl(sequence s)
    -- Appends s to out: the characters listed in htmlify[1] are replaced by
    -- the matching entity from htmlify[2], everything else is copied as is
    -- and every newline is followed by the space that indents the next line.
    for ch in s do
        integer k = find(ch,htmlify[1])
        if k then
            out &= '&' & htmlify[2][k] & ';'
        else
            out &= ch
            if ch='\n' then out &= ' ' end if
        end if
    end for
end procedure

function do_string(integer i, ni, l, string stype)
    -- Copies the string literal that starts at pgm[i] to the output.
    --  i     : index of the opening delimiter
    --  ni    : index at which the closing delimiter starts, 0 if none found
    --  l     : how many characters the closing delimiter extends beyond ni
    --  stype : the kind of string, only used in the error message
    -- Returns the index of the last character of the literal.
    -- stype is a string so it needs the %s format (it was %d)
    assert(ni>0,"%s quoted string not closed",{stype})
    ni += l
    spacenl(pgm[i..ni])
    return ni
end function

procedure flush_word(integer word_end)
    -- Appends the pending word pgm[word_start..word_end] to out, wrapped in
    -- bold markers when it is one of the keywords, and clears word_start.
    string one_word = pgm[word_start..word_end]
    bool is_key = find(one_word,keywords)
    if is_key then out &= `'''` end if
    out &= one_word
    if is_key then out &= `'''` end if
    word_start = 0
end procedure

-- scan the source one character at a time: words are collected and flushed
-- at the first character that cannot be part of a word, comments and
-- strings are copied whole
while i<=l do
    integer ch = pgm[i]
    if (ch>='a' and ch<='z') or ch='_' then
        if not word_start then word_start := i end if
    else
        if word_start then flush_word(i-1) end if
        if ch='-' and i<l and pgm[i+1]='-' then
            -- nb: does not handle --/* style comments
            integer line_comment = i
            while i<l and pgm[i+1]!='\n' do i += 1 end while
            out &= `''`
            spacenl(pgm[line_comment..i])
            out &= `''`
        elsif ch='/' and i<l and pgm[i+1]='*' then
            -- nb: does not handle nested block comments
            integer block_comment = i
            i = match(`*/`,pgm,i+2)+1
            assert(i>1,"missing closing block comment")
            out &= `''`
            spacenl(pgm[block_comment..i])
            out &= `''`
        elsif ch='"' then
            if i+1<l and pgm[i..i+2]=qqq then
                i = do_string(i,match(qqq,pgm,i+3),2,"triple")
            else
                i = do_string(i,find('"',pgm,i+1),0,"double")
            end if
        elsif find(ch,"`'") then
            string stype = iff(ch='`'?"backtick":"single")
            i = do_string(i,find(ch,pgm,i+1),0,stype)
        else
            spacenl({ch})
        end if
    end if
    i += 1
end while
-- a source that ends inside a word (no trailing newline) would otherwise
-- lose that final word
if word_start then flush_word(l) end if
puts(1,out)
{} = wait_key()

Python

Library: pygments

This solution builds on lexers available in Pygments by defining a formatter outputting simple MediaWiki markup, and a filter to translate characters to HTML escape sequences. Note that I've taken liberties with said escaping.

"""Syntax highlighting using Mediawiki formatting."""
from html import escape
from textwrap import indent

from pygments import highlight
from pygments.filter import Filter
from pygments.formatter import Formatter
from pygments.lexers import get_lexer_by_name
from pygments.token import Token


class MediaWikiFormatter(Formatter):
    """Format source code using MediaWiki markup.

    Comments and docstrings are wrapped in italic markers (two single
    quotes) and keywords in bold markers (three single quotes); every other
    token is written unchanged.
    """

    def __init__(self, **options):
        super().__init__(**options)

        # Opening/closing markup per token type.  The root ``Token`` entry
        # guarantees that the parent walk in ``format`` always terminates.
        self.styles = {
            Token: ("", ""),
            Token.Comment: ("''", "''"),
            Token.Keyword: ("'''", "'''"),
            Token.String.Doc: ("''", "''"),
        }

    def _write_styled(self, outfile, token_type, text):
        """Write ``text`` to ``outfile`` with the markup for ``token_type``.

        MediaWiki quote markup does not carry over a line break, so a run
        that spans several lines (a multi-line docstring or comment) is
        wrapped one line at a time.  Empty lines are left bare: wrapping
        nothing would produce a run of quotes that means something else.
        """
        style_begin, style_end = self.styles[token_type]
        if not (style_begin or style_end):
            outfile.write(text)
            return

        outfile.write(
            "\n".join(
                style_begin + line + style_end if line else line
                for line in text.split("\n")
            )
        )

    def format(self, token_source, outfile):
        """Write the tokens in ``token_source`` to ``outfile`` as wiki text.

        ``token_source`` is an iterable of ``(token type, value)`` pairs.
        Consecutive tokens that map to the same style are merged so they
        share one pair of markers.
        """
        last_val = ""
        last_type = None

        for token_type, value in token_source:
            # Work up the token hierarchy until a style is found.
            while token_type not in self.styles:
                token_type = token_type.parent

            # Group consecutive tokens of the same type.
            if token_type == last_type:
                last_val += value
            else:
                if last_val:
                    self._write_styled(outfile, last_type, last_val)

                last_val = value
                last_type = token_type

        # Flush remaining values.
        if last_val:
            self._write_styled(outfile, last_type, last_val)


class HTMLEscapeFilter(Filter):
    """Token filter that HTML-escapes the text of every token.

    ``html.escape`` is applied with quote escaping enabled (its default), so
    ``&``, ``<``, ``>``, ``"`` and ``'`` are replaced by character references.
    """

    def __init__(self, **options):
        super().__init__(**options)

    def filter(self, _, stream):
        """Yield each ``(token type, value)`` pair with its value escaped."""
        yield from (
            (ttype, escape(value, quote=True)) for ttype, value in stream
        )


def main(language_name="python", infile=None):
    """Print a source file as MediaWiki-formatted, space-indented text.

    ``language_name`` selects the Pygments lexer.  ``infile`` is the path of
    the file to highlight; when it is not given this script's own source is
    used.
    """

    def every_line(_line):
        # ``indent`` skips whitespace-only lines by default; blank lines
        # need the leading space too.
        return True

    lexer = get_lexer_by_name(language_name)
    lexer.add_filter(HTMLEscapeFilter())
    formatter = MediaWikiFormatter(style="bw")

    source_path = infile if infile else __file__
    with open(source_path) as fd:
        marked_up = highlight(fd.read(), lexer, formatter)
        print(indent(marked_up, " ", every_line), end="")


if __name__ == "__main__":
    main()

@@ Line 379: / Line 379: @@
  ''-- =================================''
  ''--''
- '''sequence''' cl = '''command_line'''()
+ '''string''' pgm = '''substitute'''('''get_text'''('''command_line'''()[$]),"\r\n","\n")
- '''string''' pgm = '''substitute'''('''get_text'''(cl[$]),"\r\n","\n")
  ''-- or(/for javascript compatibility) specify constant pgm = """...""" ''
  '''constant''' qqq = `""`&amp;`"`, ''/* (split to assist with permitting ^^^) */''