I'm working on modernizing Rosetta Code's infrastructure, starting with communications. Please accept this time-limited open invite to RC's Slack. --Michael Mol (talk) 20:59, 30 May 2020 (UTC)

Random sentence from book

From Rosetta Code
Task
Random sentence from book
You are encouraged to solve this task according to the task description, using any language you may know.
  • Read in the book "The War of the Worlds", by H. G. Wells.
  • Skip to the start of the book, proper.
  • Remove extraneous punctuation, but keep at least sentence-ending punctuation characters . ! and ?
  • Keep account of what words follow each word and how many times each is seen (treat sentence terminators as words too).
  • Keep account of what words follow each pair of words and how many times each is seen (again treating sentence terminators as words too).
  • Assume that a sentence starts with a not to be shown full-stop character then use a weighted random choice of the possible words that may follow a full-stop to add to the sentence.
  • Then repeatedly add words to the sentence based on weighted random choices of what words may follow the last two words to extend the sentence.
  • Stop after adding a sentence ending punctuation character.
  • Tidy and then print the sentence.


Show examples of random sentences generated.

Related task


ALGOL 68[edit]

Works with: ALGOL 68G version Any - tested with release 2.8.3.win32
# generate random sentences using text from a book as a basis        #
 
# use the associative array in the Associate array/iteration task #
PR read "aArray.a68" PR
 
# returns a copy of s with every character that appears in chars deleted #
PRIO REMOVE = 1;
OP   REMOVE = ( STRING s, chars )STRING:
     BEGIN
        # the kept characters are packed at the front of a buffer that is  #
        # the same size as s - only the filled part is returned            #
        [ LWB s : UPB s ]CHAR kept;
        INT last kept := LWB kept - 1;
        FOR i FROM LWB s TO UPB s DO
            CHAR ch = s[ i ];
            IF NOT char in string( ch, NIL, chars ) THEN
                # ch is not one of the characters to remove #
                kept[ last kept +:= 1 ] := ch
            FI
        OD;
        kept[ LWB s : last kept ]
     END # REMOVE # ;
# returns text converted to an INT or -1 if text is not a number         #
# text is a number only if it consists entirely of the digits 0 to 9;    #
# an empty string contains no digits so it is not a number either        #
OP   TOINT = ( STRING text )INT:
     IF UPB text < LWB text
     THEN
        # empty string - previously this was silently treated as 0 #
        -1
     ELSE
        INT  result     := 0;
        BOOL is numeric := TRUE;
        # stop at the first character that is not a digit #
        FOR ch pos FROM LWB text TO UPB text WHILE is numeric DO
            CHAR c = text[ ch pos ];
            is numeric := ( c >= "0" AND c <= "9" );
            IF is numeric THEN ( result *:= 10 ) +:= ABS c - ABS "0" FI
        OD;
        IF is numeric THEN result ELSE -1 FI
     FI # TOINT # ;
 
# settings - each one can be overridden on the command line by giving    #
# its keyword (in any mixture of case) followed by the value:            #
#     FILE, PREFIX, SENTENCES, MAXWORDS and STARTWORD                    #
STRING file name           := "twotw.txt";
STRING start word          := "";
INT    prefix length       := 2;
INT    number of sentences := 10;
INT    max words           := 1 000 000;
FOR arg pos TO argc - 1 DO
    # compare an upper-cased copy of the argument against the keywords   #
    STRING keyword := argv( arg pos );
    FOR k FROM LWB keyword TO UPB keyword DO
        IF is lower( keyword[ k ] ) THEN keyword[ k ] := to upper( keyword[ k ] ) FI
    OD;
    # the value of a keyword is the argument that follows it             #
    IF   keyword = "FILE"      THEN file name           :=       argv( arg pos + 1 )
    ELIF keyword = "PREFIX"    THEN prefix length       := TOINT argv( arg pos + 1 )
    ELIF keyword = "SENTENCES" THEN number of sentences := TOINT argv( arg pos + 1 )
    ELIF keyword = "MAXWORDS"  THEN max words           := TOINT argv( arg pos + 1 )
    ELIF keyword = "STARTWORD" THEN start word          :=       argv( arg pos + 1 )
    FI
OD;
 
# delimiter for separating suffixes - must not appear in the text #
# the suffix lists are held as single delimiter-separated strings and    #
# get word also treats this character as a word separator                #
CHAR suffix delimiter = REPR 1; # ^A #
# the characters stripped (using REMOVE) from prefixes before they are   #
# used as keys - the sentence enders . ! and ? are not in this list      #
STRING punctuation = """'@,/;:(){}[]*&^%$£";
 
# open the book; a non-zero result from open is treated as failure,      #
# otherwise the ELSE part reads the words and generates the sentences    #
IF FILE input file;
open( input file, file name, stand in channel ) /= 0
THEN
# failed to open the file #
print( ( "Unable to open """ + file name + """", newline ) )
ELSE
# file opened OK #
# flags set by the event handlers below and tested by the reading code   #
BOOL at eof := FALSE;
BOOL at eol := FALSE;
# set the EOF handler for the file #
on logical file end( input file
, ( REF FILE f )BOOL:
BEGIN
# note that we reached EOF on the #
# latest read #
at eof := TRUE;
# return TRUE so processing can continue #
TRUE
END
);
# set the end-of-line handler for the file so get word can see line boundaries #
on line end( input file
, ( REF FILE f )BOOL:
BEGIN
# note we reached end-of-line #
at eol := TRUE;
# return FALSE to use the default eol handling #
# i.e. just get the next character #
FALSE
END
);
# one character of lookahead that is kept between calls of get word;     #
# it starts as a space so the first call begins by skipping separators   #
CHAR c := " ";
# returns the next word from input file #
# a word is any sequence of characters separated by spaces and #
# suffix delimiters, or one of the characters ".", "!" or "?" #
# a word is also ended when the at eol flag has been set by the line end #
# handler; "" is returned once at eof has been set                       #
PROC get word = STRING:
IF at eof THEN ""
ELSE # not at end of file #
STRING word := "";
at eol := FALSE;
IF c = "." OR c = "!" OR c = "?" THEN
# sentence ending "word" #
word := c;
get( input file, ( c ) )
ELSE
# "normal" word #
# skip the separators in front of the word #
WHILE ( c = " " OR c = suffix delimiter ) AND NOT at eof DO get( input file, ( c ) ) OD;
# gather characters up to the next separator, sentence ender, #
# end of line or end of file - c is left holding the lookahead #
WHILE c /= " "
AND c /= "."
AND c /= "!"
AND c /= "?"
AND c /= suffix delimiter
AND NOT at eol
AND NOT at eof
DO
word +:= c;
get( input file, ( c ) )
OD
FI;
at eol := FALSE;
word
FI # get word # ;
 
# returns a random number between 1 and n inclusive                      #
# (when n is less than 2 there is nothing to choose, n is returned)      #
PROC random choice = ( INT n )INT:
     IF n >= 2
     THEN ENTIER ( ( next random * n ) + 1 )
     ELSE n
     FI;
 
# chooses a suffix at random to continue a sentence                      #
# sfxs is a list of suffixes separated by suffix delimiters, it may      #
# start with a delimiter (which is how the lists are stored in the       #
# associative array) and may or may not end with one                     #
# a suffix that appears several times in the list is proportionally      #
# more likely to be chosen; returns "" if the list has no suffixes       #
PROC choose suffix = ( STRING sfxs )STRING:
     BEGIN
        # normalise the list so it has no leading delimiter and exactly   #
        # one trailing delimiter - without this a stored list such as     #
        # the "*." sentence starts would yield "" for its first entry     #
        # and its final entry could never be chosen                       #
        STRING list := sfxs[ @ 1 ];
        IF UPB list >= 1 THEN
            IF list[ 1 ] = suffix delimiter THEN list := list[ 2 : UPB list @ 1 ] FI
        FI;
        IF UPB list >= 1 THEN
            IF list[ UPB list ] /= suffix delimiter THEN list +:= suffix delimiter FI
        FI;
        # count the number of suffixes - each is ended by a delimiter     #
        INT suffix max := 0;
        FOR s pos FROM LWB list TO UPB list DO
            IF list[ s pos ] = suffix delimiter THEN suffix max +:= 1 FI
        OD;
        # select a random suffix to continue the text with                #
        STRING sfx          := "";
        INT    prev pos     := LWB list - 1;
        INT    suffix count := random choice( suffix max );
        FOR s pos FROM LWB list TO UPB list WHILE suffix count > 0 DO
            IF list[ s pos ] = suffix delimiter THEN
                # found the end of a suffix #
                sfx      := list[ prev pos + 1 : s pos - 1 @ 1 ];
                prev pos := s pos;
                suffix count -:= 1
            FI
        OD;
        sfx
     END # choose suffix # ;
 
# read the words of the book and record, for each prefix of prefix       #
# length consecutive words, the words that were seen to follow it        #
# skip to the start word, if there is one #
IF start word /= "" THEN WHILE NOT at eof AND get word /= start word DO SKIP OD FI;
# get the first prefix from the file #
[ prefix length ]STRING prefix;
FOR p pos TO prefix length WHILE NOT at eof DO prefix[ p pos ] := get word OD;
IF at eof THEN
# not enough words in the file #
print( ( file name, " contains less than ", whole( prefix length, 0 ), " words", newline ) )
ELSE
# have some words #
INT word count := prefix length;
# store the prefixes and suffixes in the associative array #
# we store the suffix as a single concatenated #
# string delimited by suffix delimiters, the string will #
# have a leading delimiter #
# suffixes that appear multiple times in the input text will #
# appear multiple times in the array, this will allow them to #
# have a higher probability than suffixes that appear fewer #
# times #
# this will use more memory than storing the suffixes and a #
# count, but simplifies the generation #
# with a prefix length of 2 (as required by the task), #
# the War Of The Worlds can be processed - for longer prefix #
# lengths a less memory hungry algorithm would be needed #
REF AARRAY suffixes := INIT LOC AARRAY;
INT prefix count := 0;
WHILE NOT at eof AND word count <= max words
DO
# concatenate the prefix words to a single string #
STRING prefix text := prefix[ 1 ];
FOR p pos FROM 2 TO prefix length DO prefix text +:= ( " " + prefix[ p pos ] ) OD;
STRING suffix := get word;
# if the prefix has no lower case, ignore it as it is #
# probably a chapter heading or similar #
IF BOOL has lowercase := FALSE;
FOR s pos FROM LWB prefix text TO UPB prefix text
WHILE NOT ( has lowercase := is lower( prefix text[ s pos ] ) )
DO SKIP OD;
has lowercase
THEN
# the prefix contains some lower case #
# store the suffixes associated with the prefix #
IF NOT ( suffixes CONTAINSKEY prefix text ) THEN
# first time this prefix has appeared #
prefix count +:= 1
FI;
IF prefix[ 1 ] = "." OR prefix[ 1 ] = "!" OR prefix[ 1 ] = "?" THEN
# have the start of a sentence #
# the whole prefix, with its punctuation, is added to the list #
# of sentence starts held under the special key "*." #
suffixes // "*." +:= ( suffix delimiter + prefix text )
FI;
# the key is the prefix without punctuation, unless removing the #
# punctuation would leave nothing #
STRING prefix without punctuation = prefix text REMOVE punctuation;
IF prefix without punctuation /= "" THEN prefix text := prefix without punctuation FI;
suffixes // prefix text +:= ( suffix delimiter + suffix )
FI;
# shuffle the prefixes down one and add the new suffix #
# as the final prefix #
FOR p pos FROM 2 TO prefix length DO prefix[ p pos - 1 ] := prefix[ p pos ] OD;
prefix[ prefix length ] := suffix;
IF NOT at eof THEN word count +:= 1 FI
OD;
 
# generate text #
# each sentence starts from a randomly chosen sentence start and is #
# extended a word at a time until a sentence ender is chosen or the #
# current prefix has no recorded suffixes #
TO number of sentences DO
print( ( newline ) );
# start with a random prefix #
# NOTE(review): unlike sfxs below, this list is passed as stored, i.e. #
# with its leading delimiter and without a trailing one - check that #
# choose suffix copes with that form #
STRING pfx := choose suffix( suffixes // "*." );
STRING line := pfx[ @ 1 ][ 3 : ]; # remove the leading #
# ". " from the line #
pfx := pfx REMOVE punctuation;
BOOL finished := FALSE;
WHILE NOT finished DO
# get the suffixes of the prefix as a list with no leading delimiter #
# and with a trailing one #
IF STRING sfxs := ( suffixes // pfx );
IF LWB sfxs <= UPB sfxs THEN
IF sfxs[ LWB sfxs ] = suffix delimiter THEN sfxs := sfxs[ LWB sfxs + 1 : ] FI
FI;
sfxs +:= suffix delimiter;
sfxs = suffix delimiter
THEN
# no suffix - reached the end of the generated text #
line +:= " (" + pfx + " has no suffix)";
finished := TRUE
ELSE
# can continue to generate text #
STRING sfx = choose suffix( sfxs );
IF sfx = "." OR sfx = "!" OR sfx = "?"
THEN
# reached the end of a sentence #
finished := TRUE;
# if the line ends with ",;:", remove it #
INT line end := UPB line;
IF CHAR c = line[ line end ]; c = "," OR c = ";" OR c = ":" THEN
line end -:= 1
FI;
# remove trailing spaces #
WHILE line[ line end ] = " " AND line end > LWB line DO line end -:= 1 OD;
line := line[ LWB line : line end ] + sfx
ELSE
# not at the end of the sentence #
line +:= " " + sfx;
# remove the first word from the prefix and add #
# the suffix #
IF INT space pos := 0;
NOT char in string( " ", space pos, pfx )
THEN
# the prefix is only one word #
pfx := sfx
ELSE
# have multiple words #
pfx := ( pfx[ space pos + 1 : ] + " " + sfx )[ @ 1 ]
FI;
# as when the table was built, the key is the prefix without #
# punctuation unless that would leave nothing #
STRING pfx without punctuation = pfx REMOVE punctuation;
IF pfx without punctuation /= "" THEN pfx := pfx without punctuation FI
FI
FI
OD;
print( ( line, newline ) )
OD
FI;
close( input file )
FI
Output:

Sample output produced with the command-line:
a68g randomSentenceFromBook.a68 - FILE twotw.txt PREFIX 2 SENTENCES 10 STARTWORD cover MAXWORDS 60075
One of the sentences has been manually split over two lines.

The wine press of God that sometimes comes into the water mains near the Martians.

They said nothing to tell people until late in the back of this in the early dawn the curve of Primrose Hill.

At last as the day became excessively hot, and close, behind him, opened, and the South-Eastern and the morning sunlight.

"Are we far from Sunbury?

Since the night.

In one place but some mouldy cheese.

Then a dirty woman, carrying a baby, Gregg the butcher and his little boy, and two of them, stark and silent eloquent lips.

Unable from his window sash, and heads in every direction over the brim of which gripped a young pine trees, about the guns were waiting.

And this was the sense to keep up his son with a heavy explosion shook the air, of it first from my newspaper boy about a quarter of the heat,
    of the whole place was impassable.

Presently, he came hurrying after me he barked shortly.

Julia[edit]

""" weighted random pick of items in a Dict{String, Int} where keys are words, values counts """
function weightedrandompick(dict, total)
    # `dict` maps each candidate to its count and `total` is expected to be the sum
    # of those counts, so candidates are returned with probability count / total.
    isempty(dict) && throw(ArgumentError("cannot pick from an empty collection"))
    n = rand(1:total)
    lastkey = nothing
    for (key, value) in dict
        lastkey = key
        n -= value
        n <= 0 && return key
    end
    # Only reached when `total` exceeds the sum of the counts. Fall back to the last
    # key visited; `last(keys(dict))` is not defined for a Dict's key set.
    return lastkey
end
 
let
    # Read in the book "The War of the Worlds", by H. G. Wells.
    wotw_uri = "http://www.gutenberg.org/files/36/36-0.txt"
    wfile = "war_of_the_worlds.txt"
    stat(wfile).size == 0 && download(wotw_uri, wfile)  # download if file not here already
    text = read(wfile, String)

    # Skip to the start of the book and prune everything after its final phrase.
    startphrase, endphrase = "No one would have believed", "she has counted me, among the dead"
    text = text[findfirst(startphrase, text).start:findlast(endphrase, text).stop]

    # Remove extraneous punctuation, but keep at least sentence-ending punctuation
    # characters . ! and ? - each of which is split off as a word of its own.
    text = replace(replace(text, r"[^01-9a-zA-Z\.\?\!’,]" => " "), r"([.?!])" => s" \1")
    words = split(text, r"\s+")
    # Lower-case the first word of each sentence (except "I") so it is counted
    # together with its mid-sentence occurrences.
    for (i, w) in enumerate(words)
        w != "I" && i > 1 && words[i - 1] in [".", "?", "!"] && (words[i] = lowercase(words[i]))
    end

    # Keep account of what words follow words and how many times it is seen. Treat
    # sentence terminators (?.!) as words too. Keep account of what words follow two
    # words and how many times it is seen.
    follows, follows2 = Dict{String, Dict{String, Int}}(), Dict{String, Dict{String, Int}}()
    afterstop, wlen = Dict{String, Int}(), length(words)
    for (i, w) in enumerate(@view words[1:end-1])
        d = get!(follows, w, Dict(words[i + 1] => 0))
        get!(d, words[i + 1], 0)
        d[words[i + 1]] += 1
        if w in [".", "?", "!"]
            # words that start a sentence
            d = get!(afterstop, words[i + 1], 0)
            afterstop[words[i + 1]] += 1
        end
        (i > wlen - 2) && continue  # no third word left for a two-word prefix
        w2 = w * " " * words[i + 1]
        d = get!(follows2, w2, Dict(words[i + 2] => 0))
        get!(d, words[i + 2], 0)
        d[words[i + 2]] += 1
    end
    # Totals of the counts, used as the weights' sum by weightedrandompick.
    followsums = Dict(key => sum(values(follows[key])) for key in keys(follows))
    follow2sums = Dict(key => sum(values(follows2[key])) for key in keys(follows2))
    afterstopsum = sum(values(afterstop))

    # Assume that a sentence starts with a not to be shown full-stop character then
    # use a weighted random choice of the possible words that may follow a full-stop
    # to add to the sentence. Further words are chosen from what followed the last
    # two words, until a sentence terminator is added. The sentence is then printed.
    function makesentence()
        firstword = weightedrandompick(afterstop, afterstopsum)
        sentencewords = [firstword, weightedrandompick(follows[firstword], followsums[firstword])]
        while !(sentencewords[end] in [".", "?", "!"])
            w2 = sentencewords[end-1] * " " * sentencewords[end]
            if haskey(follows2, w2)
                push!(sentencewords, weightedrandompick(follows2[w2], follow2sums[w2]))
            else
                # pair never seen: continue with a word that can start a sentence
                push!(sentencewords, weightedrandompick(afterstop, afterstopsum))
            end
        end
        # uppercasefirst is safe for a multi-byte first character (such as ’),
        # where the byte-index slice firstword[2:end] would throw
        sentencewords[1] = uppercasefirst(firstword)
        println(join(sentencewords[1:end-1], " ") * sentencewords[end] * "\n")
    end
    # Print 3 weighted random pick sentences
    makesentence(); makesentence(); makesentence()
end
 
Output:
(RUN:)

It may be lying dead there!

I can imagine them covered with smashed windows and saw the flashes of flame flashed up
and saw through a culvert.

I remember how mockingly bright the sky was still doubtful it rapped smartly against the
starlight from the sun blazed dazzling in a flash I was beginning to face these things
but later I perceived a hold on me and rapidly growing hotter.

(RUN:)

It was this cylinder.

Ogilvy watched till one, and they say there’s been guns heard at Chertsey, heavy firing,
and that every other man still wore his dirty rags.

My companion had been enlarged, and ever!

(RUN:)

Survivors on castle hill alive but helplessly and speechlessly drunk.

Before they were killed.

The landlord should leave his.

(RUN:)

And a cheer that seemed so happy and bright.

Once down one of the tangled maze of streets would have questioned my intellectual
superiority to his feet and had been in active service and he turned to see Lord Hilton,
the lord of the parapet.

What has happened?

Nim[edit]

Inspired by Julia solution, but not a translation actually.

import random, sequtils, strutils, tables
from unicode import utf8
 
const StopChars = [".", "?", "!"]
 
proc weightedChoice(choices: CountTable[string]; totalCount: int): string =
  ## Return a string from "choices" key using counts as weights.
  ## "totalCount" is expected to be the sum of the counts in "choices".
  var n = rand(1..totalCount)
  for word, count in choices.pairs:
    dec n, count
    if n <= 0: return word
  # Only reachable if "totalCount" is greater than the sum of the counts.
  assert false, "internal error"
 
proc finalFilter(words: seq[string]): seq[string] =
  ## Eliminate words of length one (except those of a given list)
  ## and words containing only uppercase letters (words from titles).
  ## The order of the remaining words is unchanged.
  for word in words:
    if word in [".", "?", "!", "I", "A", "a"]:
      # Sentence terminators and the allowed one-letter words.
      result.add word
    elif word.len > 1 and any(word, isLowerAscii):
      result.add word
 
 
randomize()

var text = readFile("The War of the Worlds.txt")

# Extract the actual text from the book.
const
  StartText = "BOOK ONE\r\nTHE COMING OF THE MARTIANS"
  EndText = "End of the Project Gutenberg EBook"
let startIndex = text.find(StartText)
let endIndex = text.find(EndText)
# "find" returns -1 when the marker is absent; slicing with it would fail or
# silently produce the wrong text, so stop with an explicit message instead.
if startIndex < 0 or endIndex < startIndex:
  quit("Cannot find the start or the end of the book text")
text = text[startIndex..<endIndex]

# Clean the text by removing some characters and replacing others.
# As there are some non ASCII characters, we have to apply special rules.
var processedText: string
for uchar in text.utf8():
  if uchar.len == 1:
    # ASCII character.
    let ch = uchar[0]
    case ch
    of '0'..'9', 'a'..'z', 'A'..'Z', ' ': processedText.add ch  # Keep as is.
    of '\n': processedText.add ' '                                # Replace with a space.
    of '.', '?', '!': processedText.add ' ' & ch                  # Make sure these characters are isolated.
    else: discard                                                 # Remove others.
  else:
    # Some UTF-8 representation of a non ASCII character.
    case uchar
    of "—": processedText.add ' '                 # Replace EM DASH with space.
    of "ç", "æ", "’": processedText.add uchar     # Keep these characters as they are parts of words.
    of "“", "”", "‘": discard                     # Remove these ones.
    else: echo "encountered: ", uchar             # Should not go here.

# Extract words and filter them.
let words = processedText.splitWhitespace().finalFilter()

# Build count tables: what follows one word and what follows two words.
var followCount, followCount2: Table[string, CountTable[string]]
for i in 1..words.high:
  followCount.mgetOrPut(words[i - 1], initCountTable[string]()).inc(words[i])
for i in 2..words.high:
  followCount2.mgetOrPut(words[i - 2] & ' ' & words[i - 1], initCountTable[string]()).inc words[i]

# Build sum tables (total of the counts for each key).
var followSum, followSum2: CountTable[string]
for key in followCount.keys:
  for count in followCount[key].values:
    followSum.inc key, count
for key in followCount2.keys:
  for count in followCount2[key].values:
    followSum2.inc key, count

# Build table of starting words and compute the sum.
var
  startingWords: CountTable[string]
  startingSum: int
for stopChar in StopChars:
  for word, count in followCount[stopChar].pairs:
    startingWords.inc word, count
    inc startingSum, count

# Build a sentence.
let firstWord = weightedChoice(startingWords, startingSum)
var sentence = @[firstWord]
var lastWord = weightedChoice(followCount[firstWord], followSum[firstWord])
while lastWord notin StopChars:
  sentence.add lastWord
  # Prefer what followed the last two words; fall back to the last word only.
  let key = sentence[^2] & ' ' & lastWord
  lastWord = if key in followCount2:
               weightedChoice(followCount2[key], followSum2[key])
             else:
               weightedChoice(followCount[lastWord], followSum[lastWord])
echo sentence.join(" ") & lastWord
Output:

Here are some generated sentences. Short sentences are of course more likely to have a meaning.

But they won’t hunt us.
There was a whole population in movement.
The one had closed it.
Then he dropped his spade.
It’s out on the London valley.
I narrowly escaped an accident.
I assert that I had immediately to turn my attention first.
Halfway through the deserted village while the Martian approach.
I have no doubt they are mad with terror.
He told me no answer to that.
That’s how we shall save the race.
As if it had driven blindly straight at the group of soldiers to protect these strange creatures from Mars?
In the afternoon for the next day there was a carriage crashed into the parlour behind the engines going northward along the road.

Perl[edit]

#!/usr/bin/perl
 
use strict; # https://rosettacode.org/wiki/Random_sentence_from_book
use warnings;
 
# Slurp the whole book: @ARGV is localised to the file name and the input
# record separator $/ is localised to undef, so one <> read returns it all.
my $book = do { local (@ARGV, $/) = 'waroftheworlds.txt'; <> };
my (%one, %two);

# Cut the text before the START marker line and from the END marker on, turn
# everything except letters and . ! ? into spaces, then squeeze runs of spaces.
for ($book)
  {
  s/^.*?START OF THIS\N*\n//s;
  s/END OF THIS.*//s;
  tr/a-zA-Z.!?/ /c;
  tr/ / /s;
  }

# A token is a word or a sentence terminator.  For every token that has two
# more tokens after it, count what follows one token and what follows two.
my $qr = qr/(\b\w+\b|[.!?])/;
while ( $book =~ /$qr(?= *$qr *$qr)/g )
  {
  $one{$1}{$2}++;
  $two{$1}{$2}{$3}++;
  }
 
# Return one key of the hash referenced by the argument, picked at random with
# each key's chance proportional to its count.
sub weightedpick
  {
  my ($counts) = @_;
  my @pool;
  # each key goes into the pool as many times as it was counted
  for my $word ( keys %$counts )
    {
    push @pool, ($word) x $counts->{$word};
    }
  return $pool[rand @pool];
  }
 
# Build one random sentence and return it as text followed by a blank line.
sub sentence
  {
  # start from a randomly chosen sentence terminator, which is not shown
  my @sentence = qw( . ! ? )[rand 3];
  push @sentence, weightedpick( $one{ $sentence[0] } );
  # extend using what followed the last two tokens until a terminator is added
  while ( $sentence[-1] =~ /\w/ )
    {
    push @sentence, weightedpick( $two{ $sentence[-2] }{ $sentence[-1] } );
    }
  shift @sentence;

  my $text = "@sentence\n\n";
  $text =~ s/\w\K (?=[st]\b)/'/g;  # re-attach a lone s or t to the word before it
  $text =~ s/ (?=[.!?]\n)//;       # no space before the final terminator
  $text =~ s/.{60,}?\K /\n/g;      # wrap at the first space after 60 characters
  return $text;
  }

print sentence() for 1 .. 10;
Output:
The Kingston and Richmond defences forced!

I heard a scream under the seat upon which their systems were
unprepared slain as the impact of trucks the sharp whistle of
the lane my brother for the clinking of the dying man in a flash
of lightning saw between my feet to the post office a little
note in the City with the last man left alive.

I said and a remote weird crying.

said the woman over the bridges in its arrival.

In a few paces stagger and go with him all that it was to be
answered faintly.

Quite enough said the lieutenant.

I assented.

Eh?

The houses seemed deserted.


Phix[edit]

Library: Phix/libcurl
-- demo/rosetta/RandomSentence.exw
-- Generates random sentences from "The War of the Worlds" using counts of
-- which words follow one word and which follow two words.
include builtins\libcurl.e
constant url =  "http://www.gutenberg.org/files/36/36-0.txt",
         filename = "war_of_the_worlds.txt",
         fsent = "No one would have believed",              -- start of the text to use
         lasts = "End of the Project Gutenberg EBook",      -- the text used stops just before this
         unicodes = {utf32_to_utf8({#2019}),    -- rsquo
                     utf32_to_utf8({#2014})},   -- hyphen
         asciis = {"'","-"},                    -- replacements for unicodes, in the same order
         aleph = tagset('Z','A')&tagset('z','a')&tagset('9','0')&",'.?! ",  -- characters kept by filter()
         follow = new_dict(),   -- {word}      -> {words,counts}
         follow2 = new_dict()   -- {word,word} -> {words,counts}
 
-- download the book unless a local copy already exists
if not file_exists(filename) then
    printf(1,"Downloading %s...\n",{filename})
    CURLcode res = curl_easy_get_file(url,"",filename)
    if res!=CURLE_OK then crash("cannot download") end if
end if
string text = get_text(filename)
-- keep only the text from fsent up to (but excluding) lasts
text = text[match(fsent,text)..match(lasts,text)-1]
text = substitute_all(text,unicodes,asciis)
-- put a space before/around sentence terminators so that split() yields them
-- as words; hyphens and newlines become spaces
text = substitute_all(text,".?!-\n",{" ."," ? "," ! "," "," "})
text = filter(text,"in",aleph)
sequence words = split(text)

procedure account(sequence words)
    -- words is a run of two or three consecutive words (see the calling loop);
    -- the last one is counted as a follower of the word(s) before it.
    string next = words[$]
    words = words[1..$-1]
    for i=length(words) to 1 by -1 do
        -- a two-word key is counted in follow2, a one-word key in follow
        integer d = {follow,follow2}[i]
        -- t is {followers,counts}, or two empty sequences for a new key
        sequence t = getdd(words,{{},{}},d)
        integer tk = find(next,t[1])
        if tk=0 then
            t[1] = append(t[1],next)
            t[2] = append(t[2],1)
        else
            t[2][tk] += 1
        end if
        setd(words,t,d)
        -- drop the first word, leaving the shorter key for the next iteration
        words = words[2..$]
        -- one-word keys are only recorded when the word is "." (the only
        -- one-word key the generation loop looks up), or when the call was
        -- made with just two words
        if words!={"."} then exit end if -- (may as well quit)
    end for
end procedure

-- walk the words, counting each one as a follower of the one or two words
-- before it
for i=2 to length(words) do
    -- lower-case the word that starts the next sentence
    if find(words[i],{".","?","!"})
    and i<length(words) then
        words[i+1] = lower(words[i+1])
    end if
    -- pass the current word together with up to two words before it
    account(words[max(1,i-2)..i])
end for

function weighted_random_pick(sequence words, integer dict)
    -- words is the key to look up in dict, whose entry is {followers,counts};
    -- returns one of the followers, chosen at random with its count as weight
    sequence rec = getd(words,dict),
             followers = rec[1],
             counts = rec[2]
    integer remaining = rand(sum(counts))
    for k=1 to length(counts) do
        remaining -= counts[k]
        if remaining<=0 then
            return followers[k]
        end if
    end for
end function

-- print five random sentences, then wait for a key press
for n=1 to 5 do
    -- every sentence notionally follows a full stop, which is not printed
    sequence sentence = {"."}
    sentence = append(sentence,weighted_random_pick(sentence,follow))
    bool ended = false
    while not ended do
        -- the next word depends on the last two words
        string word = weighted_random_pick(sentence[-2..-1],follow2)
        sentence = append(sentence,word)
        ended = find(word,{".","?","!"})!=0
    end while
    -- drop the leading full stop and capitalise the first word
    sequence shown = sentence[2..$]
    shown[1][1] = upper(shown[1][1])
    -- the terminator is attached directly to the last word
    printf(1,"%s\n",{join(shown[1..$-1])&shown[$]})
end for
{} = wait_key()
Output:
With one another by means of a speck of blight, and apparently strengthened the walls of the spectators had gathered in one cart stood a blind man in the direction of Chobham.
I fell and lay about our feet.
And we were driving down Maybury Hill.
It was with the arms of an engine.
Now we see further.

Python[edit]

Extended to preserve some extra "sentence pausing" characters and try and tidy-up apostrophes.

from urllib.request import urlopen
import re
from string import punctuation
from collections import Counter, defaultdict
import random
 
 
# The War of the Worlds, by H. G. Wells
text_url = 'http://www.gutenberg.org/files/36/36-0.txt'
# The book text is used from the first occurrence of this phrase onwards.
text_start = 'No one would have believed'

# Characters that end a sentence.
sentence_ending = '.!?'
# Extra punctuation that is kept and treated as words.
sentence_pausing = ',;:'
 
def read_book(text_url, text_start) -> str:
    """Download the book and return its text from `text_start` onwards.

    The response is decoded as UTF-8. `str.index` raises ValueError if
    `text_start` does not occur in the downloaded text.
    """
    with urlopen(text_url) as book:
        text = book.read().decode('utf-8')
    return text[text.index(text_start):]
 
def remove_punctuation(text: str, keep=sentence_ending+sentence_pausing)-> str:
    """Remove punctuation, keeping some.

    Every character other than ASCII letters, digits, spaces, newlines and
    the characters in `keep` is replaced by a space. Runs of one kept
    character are collapsed to a single one with a space either side, so
    that str.split() later yields it as a word. The text is made to end
    with a sentence ending and is returned in lower case.
    """
    to_remove = ''.join(set(punctuation) - set(keep))
    text = text.translate(str.maketrans(to_remove, ' ' * len(to_remove))).strip()
    # Escape `keep` so that characters such as - ] ^ or \ are taken literally
    # inside the character classes below.
    kept = re.escape(keep)
    text = re.sub(fr"[^a-zA-Z0-9{kept}\n ]+", ' ', text)
    # Remove duplicates and put space around remaining punctuation
    if keep:
        text = re.sub(f"([{kept}])+", r" \1 ", text).strip()
    # `not text` guards the index: empty input used to raise IndexError here.
    if not text or text[-1] not in sentence_ending:
        text += ' .'
    return text.lower()
 
def word_follows_words(txt_with_pauses_and_endings):
    """Return two dicts of how often each word follows one word / two words.

    The text is split on whitespace and a leading '.' is added, so that the
    first word counts as following a sentence end. The first dict maps
    word -> {next word: count}; the second maps (word1, word2) ->
    {next word: count}. Both are plain dicts, so looking up an unseen key
    raises KeyError rather than creating an entry.
    """
    words = ['.'] + txt_with_pauses_and_endings.strip().split()

    # count of what word follows this
    word2next = defaultdict(lambda: defaultdict(int))
    word2next2 = defaultdict(lambda: defaultdict(int))
    for lh, rh in zip(words, words[1:]):
        word2next[lh][rh] += 1
    for lh, mid, rh in zip(words, words[1:], words[2:]):
        word2next2[(lh, mid)][rh] += 1

    return dict(word2next), dict(word2next2)
 
def gen_sentence(word2next, word2next2) -> str:
    """Generate one random sentence from the follower counts.

    The sentence notionally starts after a '.', which is not shown. The
    first word is a weighted random choice of what follows '.'; each later
    word is a weighted random choice of what follows the last two words.
    Generation stops after a sentence ending is added, then the words are
    joined and tidied.
    """
    s = ['.']
    s += random.choices(*zip(*word2next[s[-1]].items()))
    while True:
        # Normally the pair has been seen. If not, fall back to what follows
        # the last word alone, and if nothing does, end the sentence; an
        # unseen pair used to raise KeyError.
        followers = word2next2.get((s[-2], s[-1])) or word2next.get(s[-1])
        if not followers:
            s.append('.')
            break
        s += random.choices(*zip(*followers.items()))
        if s[-1] in sentence_ending:
            break

    s = ' '.join(s[1:]).capitalize()
    # no space before the kept punctuation
    s = re.sub(fr" ([{sentence_ending+sentence_pausing}])", r'\1', s)
    # re-attach contractions split apart when the apostrophes were removed
    s = re.sub(r" re\b", "'re", s)
    s = re.sub(r" s\b", "'s", s)
    s = re.sub(r"\bi\b", "I", s)

    return s
 
if __name__ == "__main__":
    # Download and clean the book, then count what follows one and two words.
    txt_with_pauses_and_endings = remove_punctuation(read_book(text_url, text_start))
    word2next, word2next2 = word_follows_words(txt_with_pauses_and_endings)
    #%%
    # Generate and show one random sentence.
    sentence = gen_sentence(word2next, word2next2)
    print(sentence)
Output:
<# A SAMPLE OF GENERATED SENTENCES

As I stood petrified and staring down the river, over which spread a multitude of dogs, I flung myself forward under the night sky, a sky of gold.

He was walking through the gaps in the water.

There was no place to their intelligence, without a word they were in position there.

Ugh!

The ringing impact of trucks, the person or entity that provided you with the torrent to recover it.

Raku[edit]

Started out as translation of Perl, but diverged.

# Read the book, keeping what the pattern captures: the text after the line
# containing '*** START OF THIS' up to 'End of the Project Gutenberg EBook'.
my $text = '36-0.txt'.IO.slurp.subst(/.+ '*** START OF THIS' .+? \n (.*?) 'End of the Project Gutenberg EBook' .*/, {$0} );

# Replace punctuation other than . ! ? ’ and , with a space.
$text.=subst(/ <+punct-[.!?\’,]> /, ' ', :g);
# Remove ’ where it has whitespace on one side or on both sides.
$text.=subst(/ (\s) '’' (\s) /, '', :g);
$text.=subst(/ (\w) '’' (\s) /, {$0~$1}, :g);
$text.=subst(/ (\s) '’' (\w) /, {$0~$1}, :g);

# %one counts what follows one token and %two what follows two tokens.
my (%one, %two);

# Take the tokens three at a time, stepping on by one token each time.
for $text.comb(/[\w+(\’\w+)?]','?|<punct>/).rotor(3 => -2) {
%two{.[0]}{.[1]}{.[2]}++;
%one{.[0]}{.[1]}++;
}
 
# Pick from the keys of %hash, each key being repeated as many times as its count.
# NOTE(review): the map block returns one list per key, so .pick may be choosing
# among per-key lists rather than from one flat pool (one caller applies [0] to
# the result) - confirm the choice really is weighted by count.
sub weightedpick (%hash) { %hash.keys.map( { $_ xx %hash{$_} } ).pick }
 
# Build one random sentence and return it as a string.
sub sentence {
    # start from a randomly chosen sentence terminator (removed again below)
    my @sentence = <. ! ?>.roll;
    @sentence.push: weightedpick( %one{ @sentence[0] } );
    # Extend using what followed the last two tokens, ending the sentence with
    # '.' when that pair was never seen. The loop condition needs an explicit
    # membership operator: without one, <. ! ?> is parsed as an associative
    # subscript on the last token.
    @sentence.push: weightedpick( %two{ @sentence[*-2] }{ @sentence[*-1] } // %('.' => 1) )[0]
      until @sentence[*-1] (elem) <. ! ?>;
    @sentence.=squish;      # drop immediately repeated tokens
    shift @sentence;        # drop the leading terminator
    # NOTE(review): presumably retries when the sentence is too short - confirm
    # how redo behaves here, as it is not lexically inside a loop
    redo if @sentence < 7;
    # join, title-case the first letter and remove the space before punctuation
    @sentence.join(' ').tc.subst(/\s(<:punct>)/, {$0}, :g);
}

say sentence() ~ "\n" for ^10;
Sample output:
To the inhabitants calling itself the Committee of Public Supply seized the opportunity of slightly shifting my position, which had caused a silent mass of smoke rose slanting and barred the face.

Why was I after the Martian within the case, but that these monsters.

As if hell was built for rabbits!

Thenks and all that the Secret of Flying, was discovered.

Or did a Martian standing sentinel I suppose the time we drew near the railway officials connected the breakdown with the butt.

Flutter, flutter, went the bats, heeding it not been for the big table like end of it a great light was seen by the humblest things that God, in his jaws coming headlong towards me, and rapidly growing hotter.

Survivors there were no longer venturing into the side roads of the planet in view.

Just as they began playing at touch in and out into the west, but nothing to me the landscape, weird and vague and strange and incomprehensible that for the wet.

Just as they seem to remember talking, wanderingly, to myself for an accident, but the captain lay off the platforms, and my wife to their former stare, and his lower limbs lay limp and dead horses.

Entrails they had fought across to the post office a little one roomed squatter’s hut of wood, surrounded by a gradual development of brain and hands, the latter giving rise to the corner.

Wren[edit]

Library: Wren-seq
import "io" for File
import "random" for Random
import "/seq" for Lst
 
// punctuation to keep (also keep hyphen and apostrophe but don't count as words)
var ending = ".!?"
var pausing = ",:;"
 
// punctuation to remove
var removing = "\"#$\%&()*+/<=>@[\\]^_`{|}~“”"
 
// read in book
var fileName = "36-0.txt" // local copy of http://www.gutenberg.org/files/36/36-0.txt
var text = File.read(fileName)
 
// skip to start
var ix = text.indexOf("No one would have believed")
text = text[ix..-1]
 
// remove extraneous punctuation
for (r in removing) text = text.replace(r, "")
 
// replace EM DASH (unicode 8212) with a space
text = text.replace("—", " ")
 
// split into words
// the split is on spaces only, so a "word" can still contain a newline;
// newlines are turned into spaces when each sentence is tidied up
var words = text.split(" ").where { |w| w != "" }.toList
// treat 'ending' and 'pausing' punctuation as words
// a word ending in one of these is replaced by a [rest, punctuation] pair,
// then the pairs are flattened back into the word list
for (i in 0...words.count) {
var w = words[i]
for (p in ending + pausing) if (w.endsWith(p)) words[i] = [w[0...-1], w[-1]]
}
words = Lst.flatten(words)
 
// Keep account of what words follow words and how many times it is seen
// First pass: dict1 maps each word to the list of every word seen after it.
var dict1 = {}
for (i in 0...words.count-1) {
    var followers = dict1[words[i]]
    if (followers) {
        followers.add(words[i+1])
    } else {
        dict1[words[i]] = [words[i+1]]
    }
}
// Second pass: replace each list by [number of followers, Lst.individuals(list)],
// which is the form weightedRandomChoice reads.
for (key in dict1.keys) {
    var followers = dict1[key]
    dict1[key] = [followers.count, Lst.individuals(followers)]
}
 
// Keep account of what words follow two words and how many times it is seen
// First pass: dict2 maps "word1 word2" to the list of every word seen after that pair.
var dict2 = {}
for (i in 0...words.count-2) {
    var pair = words[i] + " " + words[i+1]
    var followers = dict2[pair]
    if (followers) {
        followers.add(words[i+2])
    } else {
        dict2[pair] = [words[i+2]]
    }
}
// Second pass: replace each list by [number of followers, Lst.individuals(list)],
// which is the form weightedRandomChoice reads.
for (key in dict2.keys) {
    var followers = dict2[key]
    dict2[key] = [followers.count, Lst.individuals(followers)]
}
 
var rand = Random.new()

// value is [total, [[item, count], ...]]; returns an item chosen at random with
// its count as the weight (null if the counts add up to less than total)
var weightedRandomChoice = Fn.new { |value|
    var total = value[0]
    var pairs = value[1]
    var target = rand.int(total)
    var running = 0
    for (pair in pairs) {
        running = running + pair[1]
        if (target < running) return pair[0]
    }
}
 
// build 5 random sentences say
for (i in 1..5) {
// a sentence starts with a word chosen from those that follow a full stop
var sentence = weightedRandomChoice.call(dict1["."])
var lastOne = sentence
// key into dict2: the last two words, initially the unseen full stop and the first word
var lastTwo = ". " + sentence
while (true) {
var nextOne = weightedRandomChoice.call(dict2[lastTwo])
sentence = sentence + " " + nextOne
if (ending.contains(nextOne)) break // stop on reaching ending punctuation
lastTwo = lastOne + " " + nextOne
lastOne = nextOne
}
 
// tidy up sentence
// remove the space before punctuation and turn newlines left inside words into spaces
for (p in ending + pausing) sentence = sentence.replace(" %(p)", "%(p)")
sentence = sentence.replace("\n", " ")
System.print(sentence)
System.print()
}
Output:

Sample run:

In another second it had come into my mind.

He stopped behind to tell the neighbours.

A woman screamed.

Woe unto this unfaithful city!

As Mars approached opposition, Lavelle of Java set the wires of the afternoon.