Inspired by the Julia solution, but not actually a translation of it.
<lang Nim>import random, sequtils, strutils, tables
from unicode import utf8
const
  # Tokens that end a sentence: generation stops when one of them is drawn,
  # and the words following them are used as sentence starters.
  StopChars = [".", "?", "!"]
proc weightedChoice(choices: CountTable[string]; totalCount: int): string =
  ## Return a key of `choices`, picked at random with a probability
  ## proportional to its count.
  ##
  ## `totalCount` must be the sum of all the counts stored in `choices`.
  doAssert totalCount > 0, "weightedChoice: nothing to choose from"
  # Draw a rank in 1..totalCount, then walk the table subtracting each count
  # until the rank is used up: the key reached at that point is the pick.
  var n = rand(1..totalCount)
  for word, count in choices.pairs:
    dec n, count
    if n <= 0: return word
  # Only reachable when `totalCount` exceeds the real sum of the counts.
  # `doAssert` is used because a plain `assert` is removed by `-d:danger`.
  doAssert false, "internal error"
proc finalFilter(words: seq[string]): seq[string] =
  ## Return the words worth keeping, in their original order.
  ##
  ## A word is kept when it is one of the explicitly allowed one-character
  ## tokens (the sentence terminators and the words "I", "A" and "a"), or
  ## when it is longer than one byte and contains at least one lowercase
  ## ASCII letter. This drops stray single letters as well as words without
  ## any lowercase letter, such as the all-uppercase words of titles.
  const Allowed = [".", "?", "!", "I", "A", "a"]
  for word in words:
    if word in Allowed or (word.len > 1 and word.anyIt(it.isLowerAscii)):
      result.add word
var text = readFile("The War of the Worlds.txt")

# Extract the actual text from the book: everything from the first
# occurrence of the start marker up to (not including) the end marker.
const
  # NOTE(review): the definition of `StartText` is missing from this listing.
  # The value below is a placeholder; confirm the marker against the edition
  # of the book file actually used (`find` returns the FIRST occurrence).
  StartText = "BOOK ONE"
  EndText = "End of the Project Gutenberg EBook"
let startIndex = text.find(StartText)
let endIndex = text.find(EndText)
# `find` returns -1 when a marker is absent; slicing with it would fail.
if startIndex < 0 or endIndex < startIndex:
  quit "Unable to locate the text of the book between the expected markers."
text = text[startIndex..<endIndex]
# Clean the text by removing some characters and replacing others.
# As there are some non ASCII characters, we have to apply special rules.
var processedText: string
for uchar in text.utf8():
  if uchar.len == 1:
    # ASCII character.
    let ch = uchar[0]
    case ch
    of '0'..'9', 'a'..'z', 'A'..'Z', ' ':
      processedText.add ch          # Keep as is.
    of '\n':
      processedText.add ' '         # Replace with a space.
    of '.', '?', '!':
      # Insert a space first so that the terminator becomes its own word.
      processedText.add ' ' & ch
    else:
      discard                       # Remove others.
  else:
    # Some UTF-8 representation of a non ASCII character.
    case uchar
    of "—":
      processedText.add ' '         # Replace EM DASH with space.
    of "ç", "æ", "’":
      processedText.add uchar       # Keep: these are parts of words.
    of "“", "”", "‘":
      discard                       # Remove these ones.
    else:
      echo "encountered: ", uchar   # Should not go here.
# Extract words and filter them.
let words = processedText.splitWhitespace().finalFilter()

# Build count tables.
# `followCount[w]` counts the words found right after the word `w`;
# `followCount2["w1 w2"]` counts the words found right after the pair
# `w1 w2` (the key is both words joined by a single space).
var followCount, followCount2: Table[string, CountTable[string]]
for i in 1..words.high:
  followCount.mgetOrPut(words[i - 1], initCountTable[string]()).inc(words[i])
for i in 2..words.high:
  followCount2.mgetOrPut(words[i - 2] & ' ' & words[i - 1],
                         initCountTable[string]()).inc words[i]
# Build sum tables.
# For every key of a count table, store the total of its follower counts;
# these totals are the weights' sums needed by `weightedChoice`.
var followSum, followSum2: CountTable[string]
for key, followers in followCount.pairs:
  for count in followers.values:
    followSum.inc key, count
for key, followers in followCount2.pairs:
  for count in followers.values:
    followSum2.inc key, count
# Build table of starting words and compute the sum.
# A starting word is any word found right after a sentence terminator.
var
  startingWords: CountTable[string]
  startingSum: int
for stopChar in StopChars:
  # A terminator which is never followed by a word has no entry in
  # `followCount`; skip it instead of raising a `KeyError`.
  if stopChar in followCount:
    for word, count in followCount[stopChar].pairs:
      startingWords.inc word, count
      inc startingSum, count
# Build a sentence.
# Seed the default random number generator so that successive runs are not
# bound to the same sequence of draws.
randomize()
let firstWord = weightedChoice(startingWords, startingSum)
var sentence = @[firstWord]
var lastWord = weightedChoice(followCount[firstWord], followSum[firstWord])
while lastWord notin StopChars:
  sentence.add lastWord
  # Prefer the statistics of the last two words; fall back to those of the
  # last word alone when this pair has no recorded follower.
  let key = sentence[^2] & ' ' & lastWord
  lastWord =
    if key in followCount2:
      weightedChoice(followCount2[key], followSum2[key])
    else:
      weightedChoice(followCount[lastWord], followSum[lastWord])
# `lastWord` is now the terminator: append it without a separating space.
echo sentence.join(" ") & lastWord</lang>
Here are some generated sentences. Short sentences are, of course, more likely to be meaningful.
<pre>But they won’t hunt us.
There was a whole population in movement.
The one had closed it.
Then he dropped his spade.
It’s out on the London valley.
I narrowly escaped an accident.
I assert that I had immediately to turn my attention first.
Halfway through the deserted village while the Martian approach.
I have no doubt they are mad with terror.
He told me no answer to that.
That’s how we shall save the race.
As if it had driven blindly straight at the group of soldiers to protect these strange creatures from Mars?
In the afternoon for the next day there was a carriage crashed into the parlour behind the engines going northward along the road.</pre>