Jump to content

Text to HTML

From Rosetta Code
Text to HTML is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

When developing a website it is occasionally necessary to handle text that is received without formatting, and present it in a pleasing manner. To achieve this the text needs to be converted to HTML.

Write a converter from plain text to HTML.

The plain text has no formatting information.

It may have centered headlines, numbered sections, paragraphs, lists, and URIs. It could even have tables.

Simple converters restrict themselves to identifying paragraphs, but I believe more can be done if the text is analyzed.

You are not required to copy the algorithm from the existing solutions; use whatever facilities are available in your language to best solve the problem.

The only requirement is to ensure that the result is valid xhtml.

C++

#include <cstdint>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Characters treated as whitespace when trimming and when looking for indentation.
const std::string WHITESPACE = " \n\r\t\f\v";

// Returns a copy of 'text' without its leading and trailing whitespace.
// A string that holds nothing but whitespace yields the empty string.
std::string trim(const std::string& text) {
	const size_t first = text.find_first_not_of(WHITESPACE);
	if ( first == std::string::npos ) {
		return "";
	}

	const size_t last = text.find_last_not_of(WHITESPACE);
	return text.substr(first, last - first + 1);
}

// Returns a copy of 'text' in which the characters '&', '<' and '>' are replaced
// by the entities "&amp;", "&lt;" and "&gt;". Every input character is examined
// exactly once, so an ampersand that belongs to a produced entity is never
// escaped a second time. All other characters are copied unchanged.
std::string escape_HTML(const std::string& text) {
	std::string escaped;
	escaped.reserve(text.size());

	for ( const char ch : text ) {
		switch ( ch ) {
			case '&': escaped += "&amp;"; break;
			case '<': escaped += "&lt;";  break;
			case '>': escaped += "&gt;";  break;
			default:  escaped += ch;      break;
		}
	}
	return escaped;
}

// Splits 'text' at newline characters and returns the resulting lines in order.
// Lines that are empty or hold nothing but whitespace are dropped, so an indented
// but otherwise empty line (for example the one in front of the closing delimiter
// of a raw string literal) no longer becomes an empty paragraph.
// Leading and trailing whitespace of the kept lines is preserved.
std::vector<std::string> split_paragraphs(const std::string& text) {
	const char* const blanks = " \n\r\t\f\v";
	std::vector<std::string> paragraphs;

	std::string::size_type begin = 0;
	while ( begin <= text.size() ) {
		std::string::size_type end = text.find('\n', begin);
		if ( end == std::string::npos ) {
			end = text.size();
		}

		const std::string line = text.substr(begin, end - begin);
		if ( line.find_first_not_of(blanks) != std::string::npos ) {
			paragraphs.emplace_back(line);
		}
		begin = end + 1;
	}
	return paragraphs;
}

// Converts the embedded sample text to HTML and writes the document to standard output.
//
// The text is escaped, split into lines, and each line is emitted as a bulleted list
// item (leading '*'), a numbered list item (a digit followed by '.') or a paragraph.
// An indented first line is used as the document title; without one the title is
// "Untitled" and the first line is rendered as ordinary content instead of being lost.
int main() {
	const std::string sample_text = R"(
			Sample Text
		This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.
		* This is a bulleted list with a less than sign (<)
		* And this is its second line with a greater than sign (>)
		A 'normal' paragraph between the lists.
		1. This is a numbered list with an ampersand (&)
		2. "Second line" in double quotes
		3. 'Third line' in single quotes
		That's all folks.
	)";

	const std::vector<std::string> paragraphs = split_paragraphs(escape_HTML(sample_text));

	// Only consume the first line as a title when it really is an indented, non-blank line.
	std::string title = "Untitled";
	size_t first_body = 0;
	if ( ! paragraphs.empty() ) {
		const size_t indent = paragraphs[0].find_first_not_of(WHITESPACE);
		if ( indent != std::string::npos && indent > 0 ) {
			title = trim(paragraphs[0]);
			first_body = 1;
		}
	}

	std::cout << "<html>" << '\n';
	std::cout << "<head><title>" << title << "</title></head>" << '\n';
	std::cout << "<body>" << '\n';

	bool bulleted_list = false;
	bool numbered_list = false;

	for ( size_t i = first_body; i < paragraphs.size(); ++i ) {
		const std::string paragraph = trim(paragraphs[i]);
		if ( paragraph.empty() ) {
			continue; // a whitespace-only line carries no content
		}

		const bool is_bullet = paragraph[0] == '*';
		const bool is_numbered = paragraph.size() >= 2
			&& paragraph[0] >= '0' && paragraph[0] <= '9' && paragraph[1] == '.';

		// Close an open list before anything else is written, so that a list of the
		// other kind is never opened inside a list that is still open.
		if ( bulleted_list && ! is_bullet ) {
			bulleted_list = false;
			std::cout << "</ul>" << '\n';
		}
		if ( numbered_list && ! is_numbered ) {
			numbered_list = false;
			std::cout << "</ol>" << '\n';
		}

		if ( is_bullet ) {
			if ( ! bulleted_list ) {
				bulleted_list = true;
				std::cout << "<ul>" << '\n';
			}
			std::cout << "  <li>" << trim(paragraph.substr(1)) << "</li>" << '\n';
		} else if ( is_numbered ) {
			if ( ! numbered_list ) {
				numbered_list = true;
				std::cout << "<ol>" << '\n';
			}
			std::cout << "  <li>" << trim(paragraph.substr(2)) << "</li>" << '\n';
		} else {
			std::cout << "<p>" << paragraph << "</p>" << '\n';
		}
	}

	if ( bulleted_list ) {
		std::cout << "</ul>" << '\n';
	}
	if ( numbered_list ) {
		std::cout << "</ol>" << '\n';
	}
	std::cout << "</body>" << '\n';
	std::cout << "</html>" << std::endl;
}
Output:
<html>
<head><title>Sample Text</title></head>
<body>
<p>This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.</p>
<ul>
  <li>This is a bulleted list with a less than sign (<)</li>
  <li>And this is its second line with a greater than sign (>)</li>
</ul>
<p>A 'normal' paragraph between the lists.</p>
<ol>
  <li>This is a numbered list with an ampersand (&)</li>
  <li>"Second line" in double quotes</li>
  <li>'Third line' in single quotes</li>
</ol>
<p>That's all folks.</p>
<p></p>
</body>
</html>

FreeBASIC

Translation of: Go
' Returns a copy of text in which every occurrence of find is replaced by repl.
' The search resumes behind each inserted replacement, so text that was just
' inserted is not scanned again.
Function replaceStr(text As String, find As String, repl As String) As String
    Dim As Integer findLen = Len(find)
    Dim As Integer replLen = Len(repl)
    Dim As String result = text
    Dim As Integer posic = Instr(result, find)

    Do While posic > 0
        result = Left(result, posic - 1) & repl & Mid(result, posic + findLen)
        posic = Instr(posic + replLen, result, find)
    Loop

    Return result
End Function

' Returns text with the characters &, <, >, " and ' replaced by HTML entities.
' The ampersand is handled first so that the ampersands of the entities added
' afterwards are not escaped again.
Function escapeHTML(text As String) As String
    Dim As String escaped = replaceStr(text, "&", "&amp;")
    escaped = replaceStr(escaped, "<", "&lt;")
    escaped = replaceStr(escaped, ">", "&gt;")
    escaped = replaceStr(escaped, """", "&quot;")
    Return replaceStr(escaped, "'", "&#39;")
End Function

' Reports whether text, after Trim has been applied to it, begins with pattern.
Function startsWith(text As String, pattern As String) As Boolean
    Dim As String trimmed = Trim(text)
    Return Left(trimmed, Len(pattern)) = pattern
End Function

' Reports whether c is a single space or a single tab character (Chr(9)).
Function isWhitespace(c As String) As Boolean
    If c = " " Then Return True
    Return c = Chr(9)
End Function

' Splits text at line feeds (Chr(10)) and returns a pointer to the first element
' of a static array holding the non-blank pieces in order. The element after the
' last piece is an empty string, which is what callers test for with Len() to
' find the end of the list.
'
' The array is static, so the result of an earlier call is overwritten by the
' next call. At most UBound(paragraphs) pieces are stored (one slot is reserved
' for the terminator); any further pieces are dropped.
Function splitParagraphs(text As String) As String Ptr
    Static As String paragraphs(1000)

    ' Counted from zero on every call so that a repeated call does not append to
    ' the pieces left behind by the previous one.
    Dim As Integer cnt = 0
    Dim As Integer lastSlot = UBound(paragraphs) - 1
    Dim As String temp = ""

    For i As Integer = 1 To Len(text)
        Dim As String c = Mid(text, i, 1)
        If c = Chr(10) Then
            ' A piece that is empty after trimming is not stored; what has been
            ' collected so far then stays in temp and is continued by the next line.
            If Len(Trim(temp)) > 0 Then
                If cnt <= lastSlot Then
                    paragraphs(cnt) = temp
                    cnt += 1
                End If
                temp = ""
            End If
        Else
            temp &= c
        End If
    Next

    ' Text after the final line feed forms the last piece.
    If Len(Trim(temp)) > 0 Andalso cnt <= lastSlot Then
        paragraphs(cnt) = temp
        cnt += 1
    End If

    paragraphs(cnt) = ""  ' Mark end with a zero-length string
    Return @paragraphs(0)
End Function

' Sample text: paragraphs are separated by two line feeds (Chr(10)); the first
' line starts with spaces, which the code below takes as the sign of a title.
Dim Shared As String sampleText
sampleText = "     Sample Text" & Chr(10) & Chr(10) & _
"This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists." & Chr(10) & Chr(10) & _
"* This is a bulleted list with a less than sign (<)" & Chr(10) & Chr(10) & _
"* And this is its second line with a greater than sign (>)" & Chr(10) & Chr(10) & _
"A 'normal' paragraph between the lists." & Chr(10) & Chr(10) & _
"1. This is a numbered list with an ampersand (&)" & Chr(10) & Chr(10) & _
"2. ""Second line"" in double quotes" & Chr(10) & Chr(10) & _
"3. 'Third line' in single quotes" & Chr(10) & Chr(10) & _
"That's all folks."

' Test code: escape the sample, split it into paragraphs and set up the state
' used while printing (i = index of the next paragraph to print, blist / nlist =
' whether a bulleted / numbered list is currently open).
Dim As String Ptr paragraphs = splitParagraphs(escapeHTML(sampleText))
Dim As Integer i = 0
Dim As String title = "Untitled"
Dim As Boolean blist = False, nlist = False

' A first paragraph that begins with a space or tab becomes the title and is
' skipped when the body is printed.
If isWhitespace(Left((*paragraphs), 1)) Then
    title = Trim(*paragraphs)
    i = 1
End If
' Reports whether c compares between "0" and "9" inclusive.
Function isDigit(c As String) As Boolean
    If c < "0" Then Return False
    Return c <= "9"
End Function


' Print the document: head with the title, then every paragraph from index i on
' as a bulleted item, a numbered item or a plain paragraph.
Print "<html>"
Print "<head><title>"; title; "</title></head>"
Print "<body>"

While Len(paragraphs[i])
    Dim As String para = Trim(paragraphs[i])
    Dim As String lead = Left(para, 1)
    Dim As Boolean isBullet = startsWith(para, "*")
    Dim As Boolean isNumbered = isDigit(lead) Andalso startsWith(para, lead & ".")

    ' Bulleted list: open it on the first item, close it on the first non-item.
    If isBullet Then
        If Not blist Then
            blist = True
            Print "<ul>"
        End If
        Print "  <li>"; Trim(Mid(para, 2)); "</li>"
    Elseif blist Then
        blist = False
        Print "</ul>"
    End If

    ' Numbered list: same scheme, the item text starts behind "<digit>.".
    If isNumbered Then
        If Not nlist Then
            nlist = True
            Print "<ol>"
        End If
        Print "  <li>"; Trim(Mid(para, 3)); "</li>"
    Elseif nlist Then
        nlist = False
        Print "</ol>"
    End If

    ' With no list open the paragraph was neither kind of item (an item always
    ' leaves its list open), so it is printed as ordinary text.
    If Not (blist Orelse nlist) Then
        Print "<p>"; para; "</p>"
    End If

    i += 1
Wend

If blist Then Print "</ul>"
If nlist Then Print "</ol>"
Print "</body>"
Print "</html>"
Output:
<html>
<head><title>Sample Text</title></head>
<body>
<p>This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.</p>
<ul>
  <li>This is a bulleted list with a less than sign (&lt;)</li>
  <li>And this is its second line with a greater than sign (&gt;)</li>
</ul>
<p>A &#39;normal&#39; paragraph between the lists.</p>
<ol>
  <li>This is a numbered list with an ampersand (&amp;)</li>
  <li>&quot;Second line&quot; in double quotes</li>
  <li>&#39;Third line&#39; in single quotes</li>
</ol>
<p>That&#39;s all folks.</p>
</body>
</html>

Go

This isn't very sophisticated but does a few things in a simple-minded way.

package main

import (
    "fmt"
    "html"
    "regexp"
    "strings"
)

// t is the sample document. Paragraphs are separated by blank lines, and the
// leading white-space of the first line is what main uses to recognise a title.
var t = `     Sample Text

This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.

* This is a bulleted list with a less than sign (<)

* And this is its second line with a greater than sign (>)

A 'normal' paragraph between the lists. 

1. This is a numbered list with an ampersand (&)

2. "Second line" in double quotes

3. 'Third line' in single quotes

That's all folks.`

// main escapes the sample text t, splits it into paragraphs at blank lines and
// prints an HTML document to standard output. A paragraph starting with "*"
// becomes a bulleted list item, one starting with digits and "." becomes a
// numbered list item, and everything else becomes a <p> element.
func main() {
    p := regexp.MustCompile(`\n\s*(\n\s*)+`)
    ul := regexp.MustCompile(`^\*`)
    ol := regexp.MustCompile(`^\d+\.`) // one or more digits, so "10." is an item too
    text := html.EscapeString(t)       // escape <, >, &, " and '
    paras := p.Split(text, -1)

    // Assume if first character of first paragraph is white-space
    // then it's probably a document title. An empty first paragraph
    // (empty input) has no first character and is never a title.
    title := "Untitled"
    k := 0
    if first := paras[0]; first != "" && (first[0] == ' ' || first[0] == '\t') {
        title = strings.TrimSpace(first)
        k = 1
    }
    fmt.Println("<html>")
    fmt.Printf("<head><title>%s</title></head>\n", title)
    fmt.Println("<body>")

    blist := false
    nlist := false
    for _, para := range paras[k:] {
        para2 := strings.TrimSpace(para)
        if para2 == "" {
            continue // nothing to render
        }

        isBullet := ul.MatchString(para2)
        numLoc := ol.FindStringIndex(para2) // nil unless para2 starts with "<digits>."

        // Close an open list first, so that the other kind of list is never
        // opened while this one is still open.
        if blist && !isBullet {
            blist = false
            fmt.Println("</ul>")
        }
        if nlist && numLoc == nil {
            nlist = false
            fmt.Println("</ol>")
        }

        switch {
        case isBullet:
            if !blist {
                blist = true
                fmt.Println("<ul>")
            }
            fmt.Printf("  <li>%s</li>\n", strings.TrimSpace(para2[1:]))
        case numLoc != nil:
            if !nlist {
                nlist = true
                fmt.Println("<ol>")
            }
            // numLoc[1] is the index just behind the "." of the number.
            fmt.Printf("  <li>%s</li>\n", strings.TrimSpace(para2[numLoc[1]:]))
        default:
            fmt.Printf("<p>%s</p>\n", para2)
        }
    }
    if blist {
        fmt.Println("</ul>")
    }
    if nlist {
        fmt.Println("</ol>")
    }
    fmt.Println("</body>")
    fmt.Println("</html>")
}
Output:
<html>
<head><title>Sample Text</title></head>
<body>
<p>This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.</p>
<ul>
  <li>This is a bulleted list with a less than sign (&lt;)</li>
  <li>And this is its second line with a greater than sign (&gt;)</li>
</ul>
<p>A &#39;normal&#39; paragraph between the lists.</p>
<ol>
  <li>This is a numbered list with an ampersand (&amp;)</li>
  <li>&#34;Second line&#34; in double quotes</li>
  <li>&#39;Third line&#39; in single quotes</li>
</ol>
<p>That&#39;s all folks.</p>
</body>
</html>

Java

import java.util.ArrayList;
import java.util.List;

/**
 * Converts a sample of plain text to HTML and prints the document to standard output.
 * Lines starting with '*' become bulleted list items, lines starting with a digit
 * followed by '.' become numbered list items, all other lines become paragraphs.
 */
public final class TextToHTML {

	/**
	 * Renders the embedded sample text. A first line that starts with white-space
	 * is used as the document title; otherwise the title is "Untitled" and the
	 * first line is rendered as ordinary content.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String sampleText = """
			    Sample Text
			This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.
			* This is a bulleted list with a less than sign (<)
			* And this is its second line with a greater than sign (>)
			A 'normal' paragraph between the lists.
			1. This is a numbered list with an ampersand (&)
			2. "Second line" in double quotes
			3. 'Third line' in single quotes
			That's all folks.
		""";

		List<String> paragraphs = splitParagraphs(escapeHTML(sampleText));

		// Only skip the first paragraph in the body when it was consumed as the title.
		String title = "Untitled";
		int firstBody = 0;
		if ( ! paragraphs.isEmpty() && Character.isWhitespace(paragraphs.get(0).charAt(0)) ) {
			title = paragraphs.get(0).strip();
			firstBody = 1;
		}

		System.out.println("<html>");
		System.out.println("<head><title>" + title + "</title></head>");
		System.out.println("<body>");

		boolean bulletedList = false;
		boolean numberedList = false;

		for ( int i = firstBody; i < paragraphs.size(); i++ ) {
			String paragraph = paragraphs.get(i).strip();

			boolean isBullet = paragraph.startsWith("*");
			// The length check keeps charAt(1) in range for a one-character paragraph.
			boolean isNumbered = paragraph.length() >= 2
				&& Character.isDigit(paragraph.charAt(0)) && paragraph.charAt(1) == '.';

			// Close an open list before writing anything else, so that the other
			// kind of list is never opened inside a list that is still open.
			if ( bulletedList && ! isBullet ) {
				bulletedList = false;
				System.out.println("</ul>");
			}
			if ( numberedList && ! isNumbered ) {
				numberedList = false;
				System.out.println("</ol>");
			}

			if ( isBullet ) {
				if ( ! bulletedList ) {
					bulletedList = true;
					System.out.println("<ul>");
				}
				System.out.println("  <li>" + paragraph.substring(1).strip() + "</li>");
			} else if ( isNumbered ) {
				if ( ! numberedList ) {
					numberedList = true;
					System.out.println("<ol>");
				}
				System.out.println("  <li>" + paragraph.substring(2).strip() + "</li>");
			} else {
				System.out.println("<p>" + paragraph + "</p>");
			}
		}

		if ( bulletedList ) {
			System.out.println("</ul>");
		}
		if ( numberedList ) {
			System.out.println("</ol>");
		}
		System.out.println("</body>");
		System.out.println("</html>");
	}

	/**
	 * Splits the text at newline characters.
	 *
	 * @param text the text to split
	 * @return the lines that are not blank, in order, with their white-space intact
	 */
	private static List<String> splitParagraphs(String text) {
		List<String> paragraphs = new ArrayList<String>();
		StringBuilder builder = new StringBuilder();

		for ( char ch : text.toCharArray() ) {
			if ( ch == '\n' ) {
				// A blank line is not stored; its characters stay in the builder.
				if ( ! builder.toString().isBlank() ) {
					paragraphs.add(builder.toString());
					builder.setLength(0);
				}
			} else {
				builder.append(ch);
			}
		}

		if ( ! builder.toString().isBlank() ) {
			paragraphs.add(builder.toString());
		}
		return paragraphs;
	}

	/**
	 * Replaces the characters &amp;, &lt; and &gt; by their entities. The ampersand is
	 * replaced first so that the entities added afterwards are not escaped again.
	 *
	 * @param text the text to escape
	 * @return the escaped text
	 */
	private static String escapeHTML(String text) {
		return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
	}

}
Output:
<html>
<head><title>Sample Text</title></head>
<body>
<p>This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.</p>
<ul>
  <li>This is a bulleted list with a less than sign (<)</li>
  <li>And this is its second line with a greater than sign (>)</li>
</ul>
<p>A 'normal' paragraph between the lists.</p>
<ol>
  <li>This is a numbered list with an ampersand (&)</li>
  <li>"Second line" in double quotes</li>
  <li>'Third line' in single quotes</li>
</ol>
<p>That's all folks.</p>
</body>
</html>

Julia

Translation of: Go
using HttpCommon, Printf

# Sample document: one paragraph per non-empty line; the indentation of the
# first line is what txt_to_html uses to recognise a title.
const exampletxt = """            Sample Text

This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.

* This is a bulleted list with a less than sign (<)

* And this is its second line with a greater than sign (>)

A 'normal' paragraph between the lists.

1. This is a numbered list with an ampersand (&)

2. "Second line" in double quotes

3. 'Third line' in single quotes

That's all folks."""

"""
    txt_to_html(t = exampletxt)

Print an HTML rendering of the plain text `t` to standard output.

The text is split at runs of line breaks and every piece is HTML-escaped. A piece
starting with `*` becomes a bulleted list item, a piece starting with a digit and
`.` becomes a numbered list item, anything else becomes a paragraph. If the first
piece starts with a space or tab it is used as the document title.
"""
function txt_to_html(t = exampletxt)
    ul = r"^\*"
    ol = r"^\d\."
    # keepempty=false drops the empty pieces produced by leading or trailing line
    # breaks, so every element of paras has a first character.
    paras = [escapeHTML(string(piece)) for piece in split(t, r"[\r\n]+"; keepempty=false)]

    # Assume if first character of first paragraph is white-space
    # then it's probably a document title.
    title = "Untitled"
    k = 1
    if !isempty(paras) && first(paras[1]) in (' ', '\t')
        title = strip(paras[1])
        k = 2
    end
    println("<html>")
    @printf("<head><title>%s</title></head>\n", title)
    println("<body>")

    blist, nlist = false, false
    for para in @view paras[k:end]
        para2 = strip(para)
        isempty(para2) && continue  # white-space only: nothing to render

        isbullet = occursin(ul, para2)
        isnumbered = occursin(ol, para2)

        # Close an open list first, so that the other kind of list is never
        # opened while this one is still open.
        if blist && !isbullet
            blist = false
            println("</ul>")
        end
        if nlist && !isnumbered
            nlist = false
            println("</ol>")
        end

        if isbullet
            if !blist
                blist = true
                println("<ul>")
            end
            @printf("  <li>%s</li>\n", strip(para2[2:end]))
        elseif isnumbered
            if !nlist
                nlist = true
                println("<ol>")
            end
            @printf("  <li>%s</li>\n", strip(para2[3:end]))
        else
            @printf("<p>%s</p>\n", para2)
        end
    end
    if blist
        println("</ul>")
    end
    if nlist
        println("</ol>")
    end
    println("</body>")
    println("</html>")
end

txt_to_html()
Output:
<html>
<head><title>Sample Text</title></head>
<body>
<p>This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.</p>
<ul>
  <li>This is a bulleted list with a less than sign (<)</li>
  <li>And this is its second line with a greater than sign (>)</li>
</ul>
<p>A 'normal' paragraph between the lists.</p>
<ol>
  <li>This is a numbered list with an ampersand (&)</li>
  <li>"Second line" in double quotes</li>
  <li>'Third line' in single quotes</li>
</ol>
<p>That's all folks.</p>
</body>
</html>

M2000 Interpreter

Module CheckIt {
	' Converts the sample text in d$ to an HTML page, shows it, copies it to the
	' clipboard, saves it as this.html and opens that file.
	d$={        Sample Text
		
		This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.
		
		 * This is a bulleted list with a less than sign (<)
		
		   * And this is its second line with a greater than sign (>)
		
		A 'normal' paragraph between the lists. 
		
		1. This is a numbered list with an ampersand (&)
		
		2. "Second line" in double quotes
		
		3. 'Third line' in single quotes
		
		That's all folks.
		}
	
	dim par()
	' nl is taken from a string literal that spans a line break
	' NOTE(review): presumably this yields exactly the line-break sequence - confirm
	nl={
	}
	d$+=nl
	' Normalise the text; each replacement is repeated until the length of d$
	' stops changing: double spaces -> single space, a space behind a line break
	' is removed, consecutive line breaks are merged into one.
	do k=len(d$)
		d$=replace$("  ", " ", d$)
	until k=len(d$)
	do k=len(d$)
		d$=replace$(nl+" ", nl, d$)
	until k=len(d$)
	do k=len(d$)
		d$=replace$(nl+nl, nl, d$)
	until k=len(d$)
	' Drop the final line break, escape the text and split it at the line breaks
	d$=left$(d$, len(d$)-len(nl))
	let par()=piece$(@escapeHTML(d$), nl)
	' One extra item is added at the end for the closing tags
	endline=len(par())+1
	dim par(1 to endline)
	' Item 1 (the title line) is wrapped in the document head template t
	string t={<html>
	<head>
	<title>+++</title>
	</head>
	<body>
	}
	par(1)=replace$("+++", par(1), t)
	par(endline)={</body>
	</html>
	}
	' NOTE(review): presumably empties the stack of values that the data
	' statements below add to - confirm
	flush
	boolean onelevel_list, onelevel_numeric_list
	' Dispatch on the first character of every item: "*" is a bulleted item,
	' "<" is ready-made markup (the head and tail items built above), "1" to "9"
	' is a numbered item, anything else is a paragraph.
	for i=1 to endline
		select case left$(par(i),1)
		case "*"
		{ CheckFlags(1)
			data "<li>"+ltrim$(mid$(par(i), 2))+"</li>"
		}
		case "<"
		{ CheckFlags(0)
			while right$(par(i), 2)=nl
			par(i)=left$(par(i), len(par(i))-2)
			end while	
			data par(i)
		}
		case "1" to "9"
		{ CheckFlags(2)
			m=0
			j=val(par(i), 1033, m)
			data "<li>"+ltrim$(mid$(par(i), m+1))+"</li>"
		}
		case else
		{ CheckFlags(0)
			data "<p>"+par(i)+"</p>"
		}
		end select
	next
	' Join the queued fragments into one document, with nl between them
	Document doc$
	if not empty then Doc$=letter$
	while not empty
		Doc$=nl+letter$
	end while
	Report Doc$
	Clipboard Doc$
	const UTF=2, UTF_no_BOM=-2
	' insert a BOM
	' NOTE(review): LF and CR are both defined as 10; neither is used below -
	' confirm the intended value of CR
	const CRLF=0, LF=10, CR=10
	' CRLF is ok
	Save.Doc Doc$, "this.html", UTF+CRLF
	' Open the default browser
	Win file.app$("html"), dir$+"this.html"
	End
	' Queues the opening tag of the list selected by c (1 = bulleted, 2 = numbered)
	' unless that list is already flagged as open; any other value of c queues the
	' closing tags of the lists that are flagged as open and clears their flags.
	Sub CheckFlags(c as byte)
		if c=1 then
			if onelevel_list else
				data "<ul>"
				onelevel_list=true
			end if
		else.if c=2 then
			if onelevel_numeric_list else
				data "<ol>"
				onelevel_numeric_list=true
			end if
		else
			if onelevel_list then
				data "</ul>"
				onelevel_list=false
			end if
			if onelevel_numeric_list then
				data "</ol>"
				onelevel_numeric_list=false
			end if		
		end if
	End Sub
	' Returns text with &, <, >, " and ' replaced by HTML entities; & is replaced
	' first, before the entities that contain an ampersand are added.
	Function escapeHTML(text As String)
	    Local String result = text
	    result = replace$( "&", "&amp;", result)
	    result = replace$( "<", "&lt;", result)
	    result = replace$( ">", "&gt;", result)
	    result = replace$( """", "&quot;", result)
	    result = replace$( "'", "&#39;", result)
	    = result
	End Function
}
CheckIt
Output:
<html>
<head>
<title> Sample Text</title>
</head>
<body>
<p>This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.</p>
<ul>
<li>This is a bulleted list with a less than sign (&lt;)</li>
<li>And this is its second line with a greater than sign (&gt;)</li>
</ul>
<p>A &#39;normal&#39; paragraph between the lists. </p>
<ol>
<li>This is a numbered list with an ampersand (&amp;)</li>
<li>&quot;Second line&quot; in double quotes</li>
<li>&#39;Third line&#39; in single quotes</li>
</ol>
<p>That&#39;s all folks.</p>
</body>
</html>

Nim

Translation of: Go
import re, strutils, xmltree

# Sample document: paragraphs are separated by blank lines; the leading
# white-space of the first paragraph is what marks it as the title below.
const Text = """     Sample Text

This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.

* This is a bulleted list with a less than sign (<)

* And this is its second line with a greater than sign (>)

A 'normal' paragraph between the lists.

1. This is a numbered list with an ampersand (&)

2. "Second line" in double quotes

3. 'Third line' in single quotes

That's all folks."""


# Escape the sample text, split it into paragraphs at blank lines and print an
# HTML document: "*" paragraphs become bulleted items, "<digit>." paragraphs
# become numbered items, everything else becomes a <p> element.
let p = re"\n\s*(\n\s*)+"
let ul = re"^\*"
let ol = re"^\d\."
let text = xmltree.escape(Text)
let paras = text.split(p)

# Assume if first character of first paragraph is white-space
# then it's probably a document title. An empty first paragraph has no
# first character and is never a title.
var titleString = "untitled"
var start = 0
if paras[0].len > 0 and paras[0][0].isSpaceAscii:
  titleString = paras[0].strip()
  start = 1
echo "<html>"
echo "<head><title>", titleString, "</title></head>"
echo "<body>"

var blist, nlist = false
for ipara in start..paras.high:
  let para = paras[ipara].strip()
  let isBullet = para.find(ul) >= 0
  let isNumbered = para.find(ol) >= 0

  # Close an open list first, so that the other kind of list is never
  # opened while this one is still open.
  if blist and not isBullet:
    blist = false
    echo "</ul>"
  if nlist and not isNumbered:
    nlist = false
    echo "</ol>"

  if isBullet:
    if not blist:
      blist = true
      echo "<ul>"
    echo "  <li>", para[1..^1].strip(), "</li>"
  elif isNumbered:
    if not nlist:
      nlist = true
      echo "<ol>"
    echo "  <li>", para[2..^1].strip(), "</li>"
  else:
    echo "<p>", para, "</p>"

if blist: echo "</ul>"
if nlist: echo "</ol>"

echo "</body>"
echo "</html>"
Output:
<html>
<head><title>Sample Text</title></body>
<body>
<p>This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.</p>
<ul>
  <li>This is a bulleted list with a less than sign (<)</li>
  <li>And this is its second line with a greater than sign (>)</li>
</ul>
<p>A 'normal' paragraph between the lists.</p>
<ol>
  <li>This is a numbered list with an ampersand (&)</li>
  <li>"Second line" in double quotes</li>
  <li>'Third line' in single quotes</li>
</ol>
<p>That's all folks.</p>
</body>
</html>

Perl

Translation of: Raku
# 20201023 added Perl programming solution

use strict;
use warnings;

use Pod::Simple::HTML;

# POD example taken from https://juerd.nl/site.plp/perlpodtut
# The heredoc is single-quoted ('POD'), so its text is stored without interpolation.
my $pod = <<'POD';
=head1 NAME

My::Module - An example module

=head1 SYNOPSIS

    use My::Module;
    my $object = My::Module->new();
    print $object->as_string;

=head1 DESCRIPTION

This module does not really exist, it
was made for the sole purpose of
demonstrating how POD works.

=head2 Methods

=over 12

=item C<new>

Returns a new My::Module object.

=item C<as_string>

Returns a stringified representation of
the object. This is mainly for debugging
purposes.

=back

=head1 LICENSE

This is released under the Artistic
License. See L<perlartistic>.

=head1 AUTHOR

Juerd - L<http://juerd.nl/>

=head1 SEE ALSO

L<perlpod>, L<perlpodspec>

=cut
POD

# Convert the POD text in $pod to HTML, sending the result to standard output.
my $converter = Pod::Simple::HTML->new;
$converter->output_fh(*STDOUT);
$converter->parse_string_document($pod);

Phix

The best thing to do here is to keep it utterly trivial.

with javascript_semantics
-- hchars holds the characters that need escaping, hsubs the matching replacements.
-- The quote characters use &quot; and the numeric reference &#39;, both of which
-- are defined in HTML and XHTML ("&dquo;" and "&squo;" are not defined entities).
constant {hchars,hsubs} = columnize({{"&","&amp;"},
                                     {"<","&lt;"},
                                     {">","&gt;"},
                                     {"\"","&quot;"},
                                     {"\'","&#39;"}})
 
-- Page template for sprintf(): the first %s receives the title, the second the
-- body text, which is placed inside <pre> so that its layout is kept as typed.
constant fmt = """
<html>
<head><title>%s</title></head>
<body>
<pre>
%s
</pre>
</body>
</html>
"""
 
function text_to_html_page(string title, text)
    -- Escape title and text separately, then insert both into the page template.
    string safe_title = substitute_all(title,hchars,hsubs),
           safe_text = substitute_all(text,hchars,hsubs)
    return sprintf(fmt,{safe_title,safe_text})
--  return substitute_all(sprintf(fmt,{title,text}),hchars,hsubs)
end function
 
-- Sample input: a paragraph, an indented block, and a bullet with quoted lines.
constant text = """
  This is
  a paragraph
 
      a block of
      code
 
  * A one-bullet list
    > With quoted text
    >
    >     and code
"""
 
-- Write the generated page to standard output (file number 1).
puts(1,text_to_html_page("my title",text))
Output:

The last line of text_to_html_page() (the one that is commented out) was used to generate the sanitised version of the output, as needed for inclusion on this page.

<html>
<head><title>my title</title></head>
<body>
<pre>
  This is
  a paragraph

      a block of
      code

  * A one-bullet list
    &gt; With quoted text
    &gt;
    &gt;     and code

</pre>
</body>
</html>

Pike

algorithm:

  • split by line
  • find average line length to identify centered lines
  • find isolated lines to identify section headings
  • find URIs
  • identify section numbering
  • identify bullet and numbered lists
  • identify paragraphs
  • identify indented lines
  • if possible identify tables

to ensure valid xhtml create a nested structure:

  • create an xml node
  • add elements to node
  • add lines to element if appropriate

this implementation is still incomplete.

// function to calculate the average line length (not used yet below)
//
// Takes the lengths of all non-empty lines, sorts them, and returns the integer
// average over the upper part of that list. Returns 0 when there is no
// non-empty line, instead of dividing by zero.
int linelength(array lines)
{
    // automap: one length per line, with the zero lengths removed
    array sizes = sizeof(lines[*])-({0});
    if (!sizeof(sizes))
        return 0;
    sizes = sort(sizes);

    // only consider the larger half of lines minus the top 5%
    // NOTE(review): the upper index is inclusive, so one element fewer than
    // sizeof(sizes)/20 is cut off at the top - confirm whether that is intended
    array larger = sizes[sizeof(sizes)/2..sizeof(sizes)-sizeof(sizes)/20];

    int averagelarger = `+(@larger)/sizeof(larger);
    return averagelarger;
}

// Classifies every line and returns one entry per line:
//   ({ type, stripped text, position of the first non-blank character })
// for non-empty lines, and ({ "empty" }) for blank ones. The type is one of
// "heading", "bullet", "number", "indent" or "regular". Afterwards isolated
// "regular" lines are turned into headings, and "empty" entries next to
// "regular" lines are renamed to "new paragraph", "end paragraph" or
// "begin paragraph".
array mark_up(array lines)
{
    array markup = ({});

    // True when line i is blank. A position outside the array counts as blank,
    // so the first and the last line can be examined without indexing past the
    // end of the array or wrapping around to the other end.
    function(int:int) blank_at = lambda(int i)
    {
        return i < 0 || i >= sizeof(lines) || lines[i]-" "-"\t" == "";
    };

    // find special lines
    foreach(lines; int index; string line)
    {
        string strippedline = String.trim_whites(line);
        if (sizeof(strippedline))
        {
            string firstchar = strippedline[0..0];
            int pos = search(line, firstchar);

            // a line with a blank line on both sides is a heading
            if (blank_at(index-1) && blank_at(index+1))
                markup +=({ ({ "heading", strippedline, pos }) });
            else if (firstchar == "*")
                markup += ({ ({ "bullet", strippedline, pos }) });
            else if ( (<"0","1","2","3","4","5","6","7","8","9">)[firstchar] )
                markup += ({ ({ "number", strippedline, pos }) });
            else if (pos > 0)
                markup += ({ ({ "indent", strippedline, pos }) });
            else
                markup += ({ ({ "regular", strippedline, pos }) });
        }
        else markup += ({ ({ "empty" }) });
    }

    // a regular line whose neighbours are both not regular is a heading too
    foreach(markup; int index; array line)
    {
        if (index > 0 && index < sizeof(markup)-1 )
        {
            if (line[0] == "regular" && markup[index-1][0] != "regular" && markup[index+1][0] != "regular")
                line[0] = "heading";
        }
    }

    //find paragraphs
    foreach(markup; int index; array line)
    {
        if (index > 0 && index < sizeof(markup)-1 )
        {
            if (line[0] == "empty" && markup[index-1][0] == "regular" && markup[index+1][0] == "regular")
                line[0] = "new paragraph";
            else if (line[0] == "empty" && markup[index-1][0] == "regular" && markup[index+1][0] != "regular")
                line[0] = "end paragraph";
            else if (line[0] == "empty" && markup[index-1][0] != "regular" && markup[index+1][0] == "regular")
                line[0] = "begin paragraph";
        }
    }
    return markup;
}

// Builds an XML tree from the entries in markup and returns its root node.
// Each entry is read as ({ type, text, ... }): line[0] selects the case below
// and line[1] supplies the text. 'current' is used as a stack of open element
// nodes; current[-1] is the element that new children are added to. The tree
// starts with a single <div> below the root.
object make_tree(array markup)
{
    object root = Parser.XML.Tree.SimpleRootNode(); 
    // text node holding a line break; the same node object is added in several places
    object newline = Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_TEXT, "", ([]), "\n");
    array current = ({ Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_ELEMENT, "div", ([]), "") });
    root->add_child(current[-1]);

    foreach (markup; int index; array line)
    {
        switch(line[0])
        {
            // heading: an <h3> with the text, surrounded by line breaks
            case "heading": 
                      current[-1]->add_child(newline);
                      object h = Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_ELEMENT, "h3", ([]), "");
                      h->add_child(Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_TEXT, "", ([]), line[1]));
                      current[-1]->add_child(h);
                      current[-1]->add_child(newline);
                  break;
            // bullets and numbers are handled alike: leave a still open <li>,
            // then add a new <li> and make it the current element
            case "bullet":
            case "number":
                      if (current[-1]->get_tag_name() == "li")
                          current = Array.pop(current)[1];
                      current[-1]->add_child(newline);
                      object li = Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_ELEMENT, "li", ([]), "");
                      li->add_child(Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_TEXT, "", ([]), line[1]));
                      current[-1]->add_child(li);
                      current = Array.push(current, li);
                  break;
            // indented text is added to the current element; unless the previous
            // entry was a list item, the current element is left first
            // NOTE(review): for index 0, markup[index-1] refers to the last entry - confirm
            case "indent":
                      if (markup[index-1][0] != "bullet" && markup[index-1][0] != "number")
                          current = Array.pop(current)[1];
                      current[-1]->add_child(Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_TEXT, "", ([]), line[1]));
                  break;
            // leaves the current element, then falls through to open a new <p>
            case "new paragraph":
                      current = Array.pop(current)[1];
                      current[-1]->add_child(newline);
            case "begin paragraph":
                      object p = Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_ELEMENT, "p", ([]), "");
                      current[-1]->add_child(p); 
                      current = Array.push(current, p);
                 break;
            case "end paragraph":
                      current = Array.pop(current)[1];
                      current[-1]->add_child(newline);
                 break;
            // regular text is added to the current element, then falls through to the break
            case "regular":           
                      current[-1]->add_child(Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_TEXT, "", ([]), line[1]));
            case "empty": 
                  break;
        } 
    }   
    return root;
}

Racket

This task seems like it's very under-defined, but the discussion seems to be headed towards a simple markdown specification. I therefore do this with a small interface to cmark to render commonmark text.

(Note that this is not some cooked code, it's coming from code that I'm using to render class notes, and hopefully it will be useful to have such an example here. It certainly seems to me as a useful thing compared to some half-baked not-really-markdown-or-anything implementation.)

#lang at-exp racket

(require ffi/unsafe ffi/unsafe/define)

(define-ffi-definer defcmark (ffi-lib "libcmark"))

;; Option flags for libcmark's rendering functions, written as a list of
;; symbols on the Racket side.  The bit values follow the CMARK_OPT_* macros
;; of current cmark releases (SOURCEPOS = 1<<1, HARDBREAKS = 1<<2,
;; NORMALIZE = 1<<8, SMART = 1<<10).  The earlier values 1/2/4/8 only match
;; very early releases; with a current library 4 selects HARDBREAKS.
;; NOTE(review): confirm against the cmark.h of the libcmark actually loaded.
(define _cmark_opts
  (_bitmask '(sourcepos = 2 hardbreaks = 4 normalize = 256 smart = 1024)))
;; Declares the C pointer type `_node`; the rest of this example does not
;; refer to it.
(define-cpointer-type _node)
;; Binding for the C function `cmark_markdown_to_html`.  The caller supplies
;; the input bytes and the option flags; the length argument is filled in
;; automatically from the bytes.  The returned buffer is decoded as UTF-8
;; into a Racket string and then released with `free`.
(defcmark cmark_markdown_to_html
  (_fun [src : _bytes]
        [_int = (bytes-length src)]
        _cmark_opts
        -> [html : _bytes]
        -> (begin0 (bytes->string/utf-8 html)
             (free html))))

;; Render commonmark text to an HTML string.  The positional arguments are
;; string fragments that are concatenated before conversion; #:options
;; selects the cmark flags (default: normalize and smart).
(define (cmark-markdown-to-html #:options [opts '(normalize smart)] . text)
  (let* ([source (string-append* text)]
         [utf8   (string->bytes/utf-8 source)])
    (cmark_markdown_to_html utf8 opts)))

;; Demo: the at-exp form hands the text between the braces to
;; cmark-markdown-to-html as string arguments; the resulting HTML is printed.
(display @cmark-markdown-to-html{
  This is
  a paragraph

      a block of
      code

  * A one-bullet list
    > With quoted text
    >
    >     and code
})
Output:
<p>This is
a paragraph</p>
<pre><code>a block of
code
</code></pre>
<ul>
<li>A one-bullet list
<blockquote>
<p>With quoted text</p>
<pre><code>and code
</code></pre>
</blockquote>
</li>
</ul>

Raku

(formerly Perl 6)

Works with: Rakudo version 2019.11

The task specs are essentially non-existent. "Make a best guess at how to render mark-up free text"? Anything that could be trusted at all would either be extremely trivial or insanely complex. And it shows by the task example writers staying away in droves. Five examples after seven years!?

Rather than waste time on that noise, I'll demonstrate POD6 to HTML conversion. POD6 is a simple, text-only mark-up used for Raku documentation. (It's Plain Old Documentation for Raku.) It uses pretty simple textual markup and has multiple tools to convert the POD6 document into many other formats, HTML among them.

It is not markup free, but it is actually usable in production.

use Pod::To::HTML;
use HTML::Escape;

# Sample Pod6 document held in a heredoc that runs up to the POD6 marker.
# It covers headings, paragraphs, a code block, lists, formatting codes and
# a table, and is the input passed to render() further down.
my $pod6 = q:to/POD6/;
=begin pod

A very simple Pod6 document.

This is a very high-level, hand-wavey overview. There are I<lots> of other
options available.

=head1 Section headings

=head1 A top level heading

=head2 A second level heading

=head3 A third level heading

=head4 A fourth level heading

=head1 Text

Ordinary paragraphs do not require an explicit marker or delimiters.

Alternatively, there is also an explicit =para marker that can be used to
explicitly mark a paragraph.

=para
This is an ordinary paragraph.
Its text  will   be     squeezed     and
short lines filled.

=head1 Code

Enclose code in a =code block (or V<C< >> markup for short, inline samples)

=begin code
    my $name = 'Rakudo';
    say $name;
=end code

=head1 Lists

=head3 Unordered lists

=item  Grumpy
=item  Dopey
=item  Doc
=item  Happy
=item  Bashful
=item  Sneezy
=item  Sleepy

=head3 Multi-level lists

=item1  Animal
=item2  Vertebrate
=item2  Invertebrate

=item1  Phase
=item2  Solid
=item2  Liquid
=item2  Gas

=head1 Formatting codes

Formatting codes provide a way to add inline mark-up to a piece of text.

All Pod6 formatting codes consist of a single capital letter followed
immediately by a set of single or double angle brackets; Unicode double angle
brackets may be used.

Formatting codes may nest other formatting codes.

There are many formatting codes available, some of the more common ones:

=item1 V<B< >> Bold
=item1 V<I< >> Italic
=item1 V<U< >> Underline
=item1 V<C< >> Code
=item1 V<L< >> Hyperlink
=item1 V<V< >> Verbatim (Don't interpret anything inside as POD markup)

=head1 Tables

There is quite extensive markup to allow rendering tables.

A simple example:

=begin table :caption<Mystery Men>
        The Shoveller   Eddie Stevens     King Arthur's singing shovel
        Blue Raja       Geoffrey Smith    Master of cutlery
        Mr Furious      Roy Orson         Ticking time bomb of fury
        The Bowler      Carol Pinnsler    Haunted bowling ball
=end table

=end pod
POD6

# Escape the generated markup so it is shown as literal text on Rosettacode.
say escape-html(render($pod6));

# In normal use the rendered HTML would be printed unescaped:
#say render($pod6);
Returns something like:
<!doctype html>
<html lang="en">
    <head>
        <title></title>
        <meta charset="UTF-8" />
        <style>
        kbd { font-family: "Droid Sans Mono", "Luxi Mono", "Inconsolata", monospace }
        samp { font-family: "Terminus", "Courier", "Lucida Console", monospace }
        u { text-decoration: none }
        .nested {
            margin-left: 3em;
        }
        aside, u { opacity: 0.7 }
        a[id^="fn-"]:target { background: #ff0 }
        </style>
        <link rel="stylesheet" href="//design.raku.org/perl.css">

    </head>
    <body class="pod">
    <div id="___top"></div>

    <nav class="indexgroup">
<table id="TOC">
<caption><h2 id="TOC_Title">Table of Contents</h2></caption>
    <tr class="toc-level-1"><td class="toc-number">1</td><td class="toc-text"><a href="#Section_headings">Section headings</a></td></tr>
 <tr class="toc-level-1"><td class="toc-number">2</td><td class="toc-text"><a href="#A_top_level_heading">A top level heading</a></td></tr>
 <tr class="toc-level-2"><td class="toc-number">2.1</td><td class="toc-text"><a href="#A_second_level_heading">A second level heading</a></td></tr>
 <tr class="toc-level-3"><td class="toc-number">2.1.1</td><td class="toc-text"><a href="#A_third_level_heading">A third level heading</a></td></tr>
 <tr class="toc-level-4"><td class="toc-number">2.1.1.1</td><td class="toc-text"><a href="#A_fourth_level_heading">A fourth level heading</a></td></tr>
 <tr class="toc-level-1"><td class="toc-number">3</td><td class="toc-text"><a href="#Text">Text</a></td></tr>
    <tr class="toc-level-1"><td class="toc-number">4</td><td class="toc-text"><a href="#Code">Code</a></td></tr>
      <tr class="toc-level-1"><td class="toc-number">5</td><td class="toc-text"><a href="#Lists">Lists</a></td></tr>
 <tr class="toc-level-3"><td class="toc-number">5.0.1</td><td class="toc-text"><a href="#Unordered_lists">Unordered lists</a></td></tr>
        <tr class="toc-level-3"><td class="toc-number">5.0.2</td><td class="toc-text"><a href="#Multi-level_lists">Multi-level lists</a></td></tr>
        <tr class="toc-level-1"><td class="toc-number">6</td><td class="toc-text"><a href="#Formatting_codes">Formatting codes</a></td></tr>
           <tr class="toc-level-1"><td class="toc-number">7</td><td class="toc-text"><a href="#Tables">Tables</a></td></tr>
              
</table>
</nav>

    <div class="pod-body">
    <p>A very simple Pod6 document.</p>
<p>This is a very high-level, hand-wavey overview. There are <em>lots</em> of other options available.</p>
<h1 id="Section_headings"><a class="u" href="#___top" title="go to top of document">Section headings</a></h1>
<h1 id="A_top_level_heading"><a class="u" href="#___top" title="go to top of document">A top level heading</a></h1>
<h2 id="A_second_level_heading"><a class="u" href="#___top" title="go to top of document">A second level heading</a></h2>
<h3 id="A_third_level_heading"><a class="u" href="#___top" title="go to top of document">A third level heading</a></h3>
<h4 id="A_fourth_level_heading"><a class="u" href="#___top" title="go to top of document">A fourth level heading</a></h4>
<h1 id="Text"><a class="u" href="#___top" title="go to top of document">Text</a></h1>
<p>Ordinary paragraphs do not require an explicit marker or delimiters.</p>
<p>Alternatively, there is also an explicit =para marker that can be used to explicitly mark a paragraph.</p>
<p>This is an ordinary paragraph. Its text will be squeezed and short lines filled.</p>
<h1 id="Code"><a class="u" href="#___top" title="go to top of document">Code</a></h1>
<p>Enclose code in a =code block (or C&lt; &gt; markup for short, inline samples)</p>
<pre class="pod-block-code">    my $name = &#39;Rakudo&#39;;
    say $name;
</pre>
<h1 id="Lists"><a class="u" href="#___top" title="go to top of document">Lists</a></h1>
<h3 id="Unordered_lists"><a class="u" href="#___top" title="go to top of document">Unordered lists</a></h3>
<ul><li><p>Grumpy</p>
</li>
<li><p>Dopey</p>
</li>
<li><p>Doc</p>
</li>
<li><p>Happy</p>
</li>
<li><p>Bashful</p>
</li>
<li><p>Sneezy</p>
</li>
<li><p>Sleepy</p>
</li>
</ul>
<h3 id="Multi-level_lists"><a class="u" href="#___top" title="go to top of document">Multi-level lists</a></h3>
<ul><li><p>Animal</p>
</li>
<ul><li><p>Vertebrate</p>
</li>
<li><p>Invertebrate</p>
</li>
</ul>
<li><p>Phase</p>
</li>
<ul><li><p>Solid</p>
</li>
<li><p>Liquid</p>
</li>
<li><p>Gas</p>
</li>
</ul>
</ul>
<h1 id="Formatting_codes"><a class="u" href="#___top" title="go to top of document">Formatting codes</a></h1>
<p>Formatting codes provide a way to add inline mark-up to a piece of text.</p>
<p>All Pod6 formatting codes consist of a single capital letter followed immediately by a set of single or double angle brackets; Unicode double angle brackets may be used.</p>
<p>Formatting codes may nest other formatting codes.</p>
<p>There are many formatting codes available, some of the more common ones:</p>
<ul><li><p>B&lt; &gt; Bold</p>
</li>
<li><p>I&lt; &gt; Italic</p>
</li>
<li><p>U&lt; &gt; Underline</p>
</li>
<li><p>C&lt; &gt; Code</p>
</li>
<li><p>L&lt; &gt; Hyperlink</p>
</li>
<li><p>V&lt; &gt; Verbatim (Don&#39;t interpret anything inside as POD markup)</p>
</li>
</ul>
<h1 id="Tables"><a class="u" href="#___top" title="go to top of document">Tables</a></h1>
<p>There is quite extensive markup to allow rendering tables.</p>
<p>A simple example:</p>
<table class="pod-table">
<caption>Mystery Men</caption>
<tbody>
<tr> <td>The Shoveller</td> <td>Eddie Stevens</td> <td>King Arthur&#39;s singing shovel</td> </tr> <tr> <td>Blue Raja</td> <td>Geoffrey Smith</td> <td>Master of cutlery</td> </tr> <tr> <td>Mr Furious</td> <td>Roy Orson</td> <td>Ticking time bomb of fury</td> </tr> <tr> <td>The Bowler</td> <td>Carol Pinnsler</td> <td>Haunted bowling ball</td> </tr>
</tbody>
</table>
    </div>

    </body>
</html>

Tcl

This renderer doesn't do all that much. Indeed, it deliberately avoids doing all the complexity that is possible; instead it seeks to just provide the minimum that could possibly be useful to someone who is doing very simple text pages.

package require Tcl 8.5

# Split the text into paragraphs.  Surrounding whitespace is trimmed first;
# every blank-line separator is then replaced by a NUL character, which
# serves as the split delimiter.
proc splitParagraphs {text} {
    set trimmed [string trim $text]
    set marked [regsub -all {\n\s*(\n\s*)+} $trimmed \u0000]
    return [split $marked "\u0000"]
}
# Classify one paragraph.  Returns a two-element list {type text} where type
# is ul, ol, heading or normal and text is the paragraph joined onto a single
# line with its list/heading marker removed.
proc determineParagraph {para} {
    # Collapse each line break (and the whitespace around it) to one space.
    set para [regsub -all {\s*\n\s*} $para " "]

    # Bulleted item: leading asterisks followed by whitespace.
    if {[regexp -- {^\s*\*+\s} $para]} {
        return [list ul [string trimleft $para " \t*"]]
    }
    # Numbered item: digits, a period, then whitespace.
    if {[regexp -- {^\s*\d+\.\s} $para]} {
        set rest [string trimleft $para " \t\n0123456789"]
        # Drop the period that followed the number.
        set rest [string range $rest 1 end]
        return [list ol [string trimleft $rest " \t"]]
    }
    # Heading: leading hash marks followed by whitespace.
    if {[regexp -- {^#+\s} $para]} {
        return [list heading [string trimleft $para " \t#"]]
    }
    return [list normal $para]
}
# Escape the HTML special characters of a paragraph and convert the inline
# markers _word_, *word* and `word` into <i>, <b> and <tt> elements.
proc markupParagraphContent {para} {
    set para [string map {& &amp; < &lt; > &gt;} $para]
    # Pattern/replacement pairs, applied in this order.
    set rules {
        {_([\w&;]+)_}    {<i>\1</i>}
        {\*([\w&;]+)\*}  {<b>\1</b>}
        {`([\w&;]+)`}    {<tt>\1</tt>}
    }
    foreach {pattern replacement} $rules {
        set para [regsub -all $pattern $para $replacement]
    }
    return $para
}

# Build a complete HTML page from a title and a body of plain text.
#   title - page title, escaped and used for both <title> and <h1>
#   text  - body text; paragraphs are separated by blank lines
# Consecutive list paragraphs of the same kind share a single <ul>/<ol>.
# Returns the HTML document as a string.
proc markupText {title text} {
    set title [string map {& &amp; < &lt; > &gt;} $title]
    set html "<html><head><title>$title</title>\n</head><body><h1>$title</h1>\n"

    # Tag of the list that is currently open: "", ul or ol.
    set open ""
    foreach block [splitParagraphs $text] {
        lassign [determineParagraph $block] kind body
        set body [markupParagraphContent $body]

        # List tag this paragraph belongs in ("" for headings and plain text).
        set wanted [expr {$kind in {ul ol} ? $kind : ""}]

        # Leaving a list, or switching to the other kind: close the open one.
        if {$open ne "" && $open ne $wanted} {
            append html "</$open>"
        }
        if {$wanted ne ""} {
            if {$open ne $wanted} {
                append html "<$wanted>"
            }
            append html "<li>" $body "</li>\n"
        } elseif {$kind eq "heading"} {
            append html "<h2>" $body "</h2>\n"
        } else {
            append html "<p>" $body "</p>\n"
        }
        set open $wanted
    }
    # Close a list that runs up to the end of the text.
    if {$open ne ""} {
        append html "</$open>"
    }
    return [append html "</body></html>"]
}

Here's an example of how it would be used.

# Sample input with a heading, a bulleted list, a numbered list and inline
# markers; it is converted with markupText and the page is written to stdout.
set sample "
This is an example of how a pseudo-markdown-ish formatting scheme could
work. It's really much simpler than markdown, but does support a few things.

# Block paragraph types

* This is a bulleted list

* And this is the second item in it

1. Here's a numbered list

2. Second item

3. Third item

# Inline formatting types

The formatter can render text with _italics_, *bold* and in a `typewriter`
font. It also does the right thing with <angle brackets> and &amp;ersands,
but relies on the encoding of the characters to be conveyed separately."

puts [markupText "Sample" $sample]
Output:
<html><head><title>Sample</title>
</head><body><h1>Sample</h1>
<p>This is an example of how a pseudo-markdown-ish formatting scheme could work. It's really much simpler than markdown, but does support a few things.</p>
<h2>Block paragraph types</h2>
<ul><li>This is a bulleted list</li>
<li>And this is the second item in it</li>
</ul><ol><li>Here's a numbered list</li>
<li>Second item</li>
<li>Third item</li>
</ol><h2>Inline formatting types</h2>
<p>The formatter can render text with <i>italics</i>, <b>bold</b> and in a <tt>typewriter</tt> font. It also does the right thing with &lt;angle brackets&gt; and &amp;amp;ersands, but relies on the encoding of the characters to be conveyed separately.</p>
</body></html>

Vim Script

The problem description is quite open-ended, so this example considers the following as criteria for this Vim Script solution:

  • The initial line has the title, which will also be treated as heading level 1 and centred.
  • Centred lines (i.e., preceded by more than one space) will be treated as heading level 2 and also centred.
  • There is no markup (as you would see in Markdown, Asciidoc, or other light markup languages). However, this excludes...
  • Bulleted and numbered lists, which are determined by lines starting with asterisk-space and numeral-period-space respectively (as you would expect in "plain text").
  • Tables in the input are identified by text delimited by tab characters (in contiguous lines), with the first line treated as the table's header.
  • Since the output is XHTML, (a) The XML declaration, DOCTYPE, and XML namespace should be as per XHTML 1.0 Strict, and (b) XML predefined entities should be used where appropriate, i.e., &amp;, &apos;, &gt;, &lt;, and &quot; but if character references are in the text file already those should be left as-is.
  • Hypertext external links will be handled, and their content should be replicated in the main text.

Input file

                        Text to HTML using Vim Script

                                Introduction

This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.

                                   Lists

A 'normal' paragraph before a list.

* This is a bulleted list with a less than sign (<)

* And this is its second line with a greater than sign (>)

A 'normal' paragraph between the lists.

1. This is a numbered list with an ampersand (&), but DO NOT substitute the ampersands within character references like &#x1F606; (😆)

2. "Second line" in double quotes, with “smart” quotes

3. 'Third line' in single quotes, with ‘smart’ ones too, and

4. This, https://rosettacode.org/wiki/Text_to_HTML, is a URI.

                                   Tables

A normal paragraph before a table, which has been formulated with U+0009 tab characters:

Head cell 1	Head cell 2	Head cell 3
Row 2 col 1	Row 2 col 2	Row 2 col 3
Row 3 col 1	Row 3 col 2	Row 3 col 3
Row 4 col 1	Row 4 col 2	Row 4 col 3

The HTML output may be checked against https://validator.w3.org/check to validate that it is valid XHTML.

                                 Conclusion

That's all folks.

NB: &#x1F606; in the input file needed to have &amp; added to it to display correctly.

Vim Script (and running it)

The following Vim Script has been written to be run from the command line with:

vim -c "source Text_to_HTML.vim" Text_to_HTML.xhtml

where:

  • Text_to_HTML.xhtml is the input file (a copy of the .txt file to convert), above, which will be overwritten by
  • Text_to_HTML.vim, the Vim Script, reproduced below.
" Substitute the XML predefined character entities.
" An ampersand is left alone only when a letter or '#' follows it, so that
" references already present in the text (e.g. &#x1F606;) survive.
" The class is spelled A-Za-z because the range A-z also spans [ \ ] ^ _ `,
" and the end-of-line branch catches an ampersand that ends a line.
%s/&\ze\([^A-Za-z#]\|$\)/\&amp;/g
%s/>/\&gt;/g
%s/</\&lt;/g
%s/"/\&quot;/g
%s/'/\&apos;/g
" Substitute URIs: presumes ! $ & ' ( ) * + , ; = : will be %xx escaped.
" The g flag links every URI on a line rather than only the first one, and
" A-Za-z is used because the range A-z would also accept [ \ ] ^ and `.
" The match must be followed, on the same line, by a character other than
" '.' or ':'.
%s/http[s]\?:\/\/[A-Za-z0-9._~:/-]\+\ze[^.:]/<a href="\0">\0<\/a>/g
" Substitute simple tables, which use tabs (U+0009)
" Step 1: wrap each run of tab-delimited lines in <table>...</table>
%s/\([^\t]\+\t.\+\n\n\?\)\+/<table>\r\0<\/table>\r/
" Step 2: put <thead> in front of the run and </tbody> after it
%s/\([^\t]\+\t.\+\n\n\?\)\+/<thead>\0<\/tbody>/
" Step 3: the text after <thead> becomes the header row; close </thead> and open <tbody>
%s/\(<thead>\)\(.\+\)/\1\r<tr>\2<\/tr>\r<\/thead>\r<tbody>/
" Step 4: wrap the row directly before </tbody> in <tr>...</tr>
%s/^\([^<][^\t]\+\t.\+\)\n\n\?\(<\/tbody>\)/<tr>\1<\/tr>\r\2\r/
" Step 5: wrap the remaining tab-delimited lines in <tr>...</tr>
%s/^\([^<][^\t]\+\t.\+\)\n\n\?/<tr>\1<\/tr>\r/
" Step 6: turn the tabs inside each row into cell boundaries
%s/<tr>\zs.*\ze<\/tr>/\=substitute(submatch(0), '\t', '<\/td><td>', 'g')/g
" Step 7: open the first cell and close the last cell of every row
%s/<tr>/&<td>/
%s/<\/tr>/<\/td>&/
" Substitute the unordered list items, and temporarily precede them with <!--ulx-->
%s/* \(.\+\)\n\n*/<!--ulx--><li>\1<\/li>\r/
" Substitute the ordered list items, and temporarily precede them with <!--olx-->
%s/\d[.] \(.\+\)\n\n*/<!--olx--><li>\1<\/li>\r/
" Clean up <!--olx--> contiguous lines, wrapping them in <ol>
%s/\(<!--olx--><li>.\+\n\)\+/<ol>\r&<\/ol>\r/
" Clean up <!--ulx--> contiguous lines, wrapping them in <ul>
%s/\(<!--ulx--><li>.\+\n\)\+/<ul>\r&<\/ul>\r/
" Clean up <!--?lx--> - remove the placeholder comment
%s/<!--.lx-->//g
" Add the XML declaration, XHTML strict DOCTYPE, <head> and <title> block (with a <style> block of CSS for the headings and tables), putting the text of line 1 within <title>...</title> and <h1>...</h1>
1s/\s\+\(.\+\)\n\n\?/<\?xml version="1.0" encoding="UTF-8"\?>\r<!DOCTYPE html PUBLIC "-\/\/W3C\/\/DTD XHTML 1.0 Strict\/\/EN" "http:\/\/www.w3.org\/TR\/xhtml1\/DTD\/xhtml1-strict.dtd">\r<html xmlns="http:\/\/www.w3.org\/1999\/xhtml" xml:lang="en" lang="en">\r<head><title>\1<\/title>\r<style type="text\/css">\rh1, h2 { font-weight: bold; text-align: center; }\rtable, th, td { border: 1px solid black; }\r<\/style>\r<\/head>\r<body>\r<h1>\1<\/h1>\r/
" Substitute paragraphs starting with space+ A-Z and wrap within a <h2>...</h2>
%s/^\s\+\([A-Z].\+\)\n/<h2>\1<\/h2>\r/
" Substitute paragraphs starting with A-Z and wrap within a <p>...</p>
%s/^\([A-Z].\+\)\n/<p>\1<\/p>\r/
" Add the </body> and </html> to the end of the buffer
$s/\n/&<\/body>\r<\/html>/
" Substitute double returns with single returns
%s/\n\n/\r/
" Write the file and quit Vim
wq!
Output:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><title>Text to HTML using Vim Script</title>
<style type="text/css">
h1, h2 { font-weight: bold; text-align: center; }
table, th, td { border: 1px solid black; }
</style>
</head>
<body>
<h1>Text to HTML using Vim Script</h1>
<h2>Introduction</h2>
<p>This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.</p>
<h2>Lists</h2>
<p>A 'normal' paragraph before a list.</p>
<ul>
<li>This is a bulleted list with a less than sign (<)</li>
<li>And this is its second line with a greater than sign (>)</li>
</ul>
<p>A 'normal' paragraph between the lists.</p>
<ol>
<li>This is a numbered list with an ampersand (&), but DO NOT substitute the ampersands within character references like &#x1F606; (😆)</li>
<li>"Second line" in double quotes, with “smart” quotes</li>
<li>'Third line' in single quotes, with ‘smart’ ones too, and</li>
<li>This, <a href="https://rosettacode.org/wiki/Text_to_HTML">https://rosettacode.org/wiki/Text_to_HTML</a>, is a URI.</li>
</ol>
<h2>Tables</h2>
<p>A normal paragraph before a table, which has been formulated with U+0009 tab characters:</p>
<table>
<thead>
<tr><td>Head cell 1</td><td>Head cell 2</td><td>Head cell 3</td></tr>
</thead>
<tbody>
<tr><td>Row 2 col 1</td><td>Row 2 col 2</td><td>Row 2 col 3</td></tr>
<tr><td>Row 3 col 1</td><td>Row 3 col 2</td><td>Row 3 col 3</td></tr>
<tr><td>Row 4 col 1</td><td>Row 4 col 2</td><td>Row 4 col 3</td></tr>
</tbody>
</table>
<p>The HTML output may be checked against <a href="https://validator.w3.org/check">https://validator.w3.org/check</a> to validate that it is valid XHTML.</p>
<h2>Conclusion</h2>
<p>That's all folks.</p>
</body>
</html>

NB: Again, &#x1F606; in the output file needed to have &amp; added to it to display correctly.

This output validates (checked, as noted in the penultimate paragraph of the output, at https://validator.w3.org/check).

Wren

Translation of: Go
Library: Wren-pattern
import "./pattern" for Pattern

// Sample input: an indented first paragraph (used as the title below),
// plain paragraphs, a bulleted list and a numbered list, all separated by
// blank lines.
var t = """     Sample Text

This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.

* This is a bulleted list with a less than sign (<)

* And this is its second line with a greater than sign (>)

A 'normal' paragraph between the lists. 

1. This is a numbered list with an ampersand (&)

2. "Second line" in double quotes

3. 'Third line' in single quotes

That's all folks."""

// Escape the XML special characters.  "&" is handled first so that the
// ampersands of the entities inserted afterwards are not escaped again.
// prefer the standard &quot; for escaping a double-quote character rather than Go's &#34;
var entities = [
    ["&", "&amp;"],
    ["<", "&lt;"],
    [">", "&gt;"],
    ["\"", "&quot;"],
    ["'", "&#39;"]
]
for (pair in entities) {
    t = t.replace(pair[0], pair[1])
}

// Paragraphs are separated by a blank line.
var paras = t.split("\n\n")
// Pattern used below to recognise numbered-list paragraphs.
var ol = Pattern.new("/d.", Pattern.start)

// Assume if first character of first paragraph is white-space
// then it's probably a document title.
var title = "Untitled"
var k = 0
var lead = paras[0][0]
if (lead == " " || lead == "\t") {
    // The title paragraph is consumed here and skipped by the main loop.
    title = paras[0].trim()
    k = 1
}

System.print("<html>")
System.print("<head><title>%(title)</title></head>")
System.print("<body>")

// Emit one element per paragraph.  blist / nlist record whether a <ul> or
// an <ol> is currently open, so consecutive items share one list element.
var blist = false
var nlist = false
for (para in paras.skip(k)) {
    var para2 = para.trim()
    // Set once the paragraph has been emitted as a list item.
    var cont = false
    if (para2.startsWith("*")) {
        // A bullet directly after a numbered item: close the <ol> first so
        // that the <ul> is not emitted inside it, which is invalid XHTML.
        if (nlist) {
            nlist = false
            System.print("</ol>")
        }
        if (!blist) {
            blist = true
            System.print("<ul>")
        }
        // Drop the leading "*" marker.
        para2 = para2[1..-1].trim()
        System.print("  <li>%(para2)</li>")
        cont = true
    } else if (blist) {
        blist = false
        System.print("</ul>")
    }

    if (!cont) {
        if (ol.isMatch(para2)) {
            if (!nlist) {
                nlist = true
                System.print("<ol>")
            }
            // Drop the two-character list marker.
            para2 = para2[2..-1].trim()
            System.print("  <li>%(para2)</li>")
            cont = true
        } else if (nlist) {
            nlist = false
            System.print("</ol>")
        }
        if (!cont && !blist && !nlist) System.print("<p>%(para2)</p>")
    }
}

// Close a list that is still open when the text ends, then finish the page.
if (blist) System.print("</ul>")
// Was `System.prin`, which is not a method of System and fails at runtime
// whenever the text ends inside a numbered list.
if (nlist) System.print("</ol>")
System.print("</body>")
System.print("</html>")
Output:
<html>
<head><title>Sample Text</title></head>
<body>
<p>This is an example of converting plain text to HTML which demonstrates extracting a title and escaping certain characters within bulleted and numbered lists.</p>
<ul>
  <li>This is a bulleted list with a less than sign (<)</li>
  <li>And this is its second line with a greater than sign (>)</li>
</ul>
<p>A 'normal' paragraph between the lists.</p>
<ol>
  <li>This is a numbered list with an ampersand (&)</li>
  <li>"Second line" in double quotes</li>
  <li>'Third line' in single quotes</li>
</ol>
<p>That's all folks.</p>
</body>
</html>