Execute a Markov algorithm: Difference between revisions
(→{{header|Python}}: Add the last stretch goal to the assertions.) |
(→{{header|C++}}: Make it more structured.) |
||
Line 271: | Line 271: | ||
#include <vector> |
#include <vector> |
||
#include <string> |
#include <string> |
||
struct rule |
struct rule |
||
{ |
{ |
||
Line 284: | Line 284: | ||
} |
} |
||
}; |
}; |
||
std::string const whitespace = " \t"; |
std::string const whitespace = " \t"; |
||
std::string::size_type const npos = std::string::npos; |
std::string::size_type const npos = std::string::npos; |
||
bool is_whitespace(char c) |
bool is_whitespace(char c) |
||
{ |
{ |
||
Line 293: | Line 293: | ||
} |
} |
||
std::vector<rule> read_rules(std::ifstream& rulefile) |
|||
⚫ | |||
{ |
{ |
||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
std::vector<rule> rules; |
std::vector<rule> rules; |
||
std::string line; |
std::string line; |
||
Line 307: | Line 300: | ||
{ |
{ |
||
std::string::size_type pos; |
std::string::size_type pos; |
||
// remove comments |
// remove comments |
||
pos = line.find('#'); |
pos = line.find('#'); |
||
if (pos != npos) |
if (pos != npos) |
||
line.resize(pos); |
line.resize(pos); |
||
// ignore lines consisting only of whitespace |
// ignore lines consisting only of whitespace |
||
if (line.find_first_not_of(whitespace) == npos) |
if (line.find_first_not_of(whitespace) == npos) |
||
continue; |
continue; |
||
// find "->" surrounded by whitespace |
// find "->" surrounded by whitespace |
||
pos = line.find("->"); |
pos = line.find("->"); |
||
while (pos != npos && (pos == 0 || !is_whitespace(line[pos-1]))) |
while (pos != npos && (pos == 0 || !is_whitespace(line[pos-1]))) |
||
pos = line.find("->", pos+1); |
pos = line.find("->", pos+1); |
||
if (pos == npos || line.length() < pos+3 || !is_whitespace(line[pos+2])) |
if (pos == npos || line.length() < pos+3 || !is_whitespace(line[pos+2])) |
||
{ |
{ |
||
Line 327: | Line 320: | ||
return EXIT_FAILURE; |
return EXIT_FAILURE; |
||
} |
} |
||
std::string pattern = line.substr(0, pos-1); |
std::string pattern = line.substr(0, pos-1); |
||
std::string replacement = line.substr(pos+3); |
std::string replacement = line.substr(pos+3); |
||
// remove additional separating whitespace |
// remove additional separating whitespace |
||
pattern.erase(pattern.find_last_not_of(whitespace)+1); |
pattern.erase(pattern.find_last_not_of(whitespace)+1); |
||
replacement.erase(0, replacement.find_first_not_of(whitespace)); |
replacement.erase(0, replacement.find_first_not_of(whitespace)); |
||
// test for terminal rule |
// test for terminal rule |
||
bool terminal = !replacement.empty() && replacement[0] == '.'; |
bool terminal = !replacement.empty() && replacement[0] == '.'; |
||
if (terminal) |
if (terminal) |
||
replacement.erase(0,1); |
replacement.erase(0,1); |
||
rules.push_back(rule(pattern, replacement, terminal)); |
rules.push_back(rule(pattern, replacement, terminal)); |
||
} |
} |
||
return rules; |
|||
⚫ | |||
} |
|||
std::string markov(std::vector<rule> rules, std::string input) |
|||
{ |
|||
std::string& output = input; |
|||
std::vector<rule>::iterator iter = rules.begin(); |
std::vector<rule>::iterator iter = rules.begin(); |
||
// Loop through each rule, transforming our current version |
|||
// with each rule. |
|||
while (iter != rules.end()) |
while (iter != rules.end()) |
||
{ |
{ |
||
std::string::size_type pos = |
std::string::size_type pos = output.find(iter->pattern); |
||
if (pos != npos) |
if (pos != npos) |
||
{ |
{ |
||
output.replace(pos, iter->pattern.length(), iter->replacement); |
|||
if (iter->terminal) |
if (iter->terminal) |
||
break; |
break; |
||
Line 358: | Line 359: | ||
} |
} |
||
return output; |
|||
⚫ | |||
} |
} |
||
⚫ | |||
{ |
|||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
std::vector<rule> rules = read_rules(rulefile); |
|||
⚫ | |||
std::string output = markov(rules, input); |
|||
⚫ | |||
} |
|||
</lang> |
</lang> |
||
Revision as of 19:52, 17 December 2009
This page uses content from Wikipedia. The original article was at Markov_algorithm. The list of authors can be seen in the page history. As with Rosetta Code, the text of Wikipedia is available under the GNU FDL. (See links for details on variance) |
You are encouraged to solve this task according to the task description, using any language you may know.
Create an interpreter for a Markov Algorithm. Rules have the syntax:
<ruleset> ::= ((<comment> | <rule>) <newline>+)* <comment> ::= # {<any character>} <rule> ::= <pattern> <whitespace> -> <whitespace> [.] <replacement> <whitespace> ::= (<tab> | <space>) [<whitespace>]
There is one rule per line. If there is a . present before the <replacement>, then this is a terminating rule in which case the interpreter must halt execution. A ruleset consists of a sequence of rules, with optional comments.
In order to promote flexibility, the interpreter should load the set of rules from one file, take the string to operate on from a second file, and write the output to a third.
Use the following three tests on entries:
Ruleset 1:
# This rules file is extracted from Wikipedia: # http://en.wikipedia.org/wiki/Markov_Algorithm A -> apple B -> bag S -> shop T -> the the shop -> my brother a never used -> .terminating rule
Sample text of:
I bought a B of As from T S.
Should generate the output:
I bought a bag of apples from my brother.
Ruleset 2:
A test of the terminating rule
# Slightly modified from the rules on Wikipedia A -> apple B -> bag S -> .shop T -> the the shop -> my brother a never used -> .terminating rule
Sample text of:
I bought a B of As from T S.
Should generate:
I bought a bag of apples from T shop.
Ruleset 3:
A stretch goal. This tests for correct substitution order and may trap simple regexp based replacement routines if special regexp characters are not escaped.
# BNF Syntax testing rules A -> apple WWWW -> with Bgage -> ->.* B -> bag ->.* -> money W -> WW S -> .shop T -> the the shop -> my brother a never used -> .terminating rule
Sample text of:
I bought a B of As W my Bgage from T S.
Should generate:
I bought a bag of apples with my money from T shop.
Ruleset 4:
A stretch goal. This tests for correct order of scanning of rules, and may trap replacement routines that scan in the wrong order. It implements a general unary multiplication engine. (Note that the input expression must be placed within underscores in this implementation.)
### Unary Multiplication Engine, for testing Markov Algorithm implementations ### By Donal Fellows. # Unary addition engine _+1 -> _1+ 1+1 -> 11+ # Pass for converting from the splitting of multiplication into ordinary # addition 1! -> !1 ,! -> !+ _! -> _ # Unary multiplication by duplicating left side, right side times 1*1 -> x,@y 1x -> xX X, -> 1,1 X1 -> 1X _x -> _X ,x -> ,X y1 -> 1y y_ -> _ # Next phase of applying 1@1 -> x,@y 1@_ -> @_ ,@_ -> !_ ++ -> + # Termination cleanup for addition _1 -> 1 1+_ -> 1 _+_ ->
Sample text of:
_1111*11111_
should generate the output:
11111111111111111111
C
<lang c>#include <stdio.h>
- include <stdlib.h>
- include <stdbool.h>
- include <string.h>
- include <assert.h>
- define MAX_RULE_LEN 1024
- define MAX_STR_LEN 1024
typedef struct rulestruct {
char *trigger; char *replacement; bool terminal; struct rulestruct *next;
} rule_t;
rule_t *free_rule(rule_t *r)
{
if ( r == NULL ) return NULL; if ( r->trigger != NULL ) free(r->trigger); if ( r->replacement != NULL ) free(r->replacement); rule_t *next = r->next; free(r); return next;
}
void free_rulelist(rule_t *head) {
rule_t *n = head; while( n != NULL ) n = free_rule(n);
}
void readrules(FILE *f, rule_t **ruleset) {
char buffer[MAX_RULE_LEN]; rule_t *t, *prev; int i, j; size_t l; *ruleset = prev = NULL; for(l=1; fgets(buffer, MAX_RULE_LEN, f) != NULL; l++ ) { if ( buffer[0] == '#' ) continue; // not a rule but a comment t = malloc(sizeof(rule_t)); assert( t != NULL ); memset(t, 0, sizeof(rule_t)); // just to be sure, in case of failure, to avoid // freeing unallocated memory // skip blank lines (there cannot be leading spaces...!) if ( (buffer[0] == '\n') || (buffer[0] == '\r') ) continue; // it's a rule: let's move until the first " -> " char *map = strstr(buffer, " -> "); if ( map == NULL ) { fprintf(stderr, "rule set syntax error line %d\n", l); free_rule(t); return; } i = map - buffer + 4; // skip " -> " j = map - buffer - 1; while( (buffer[j] == ' ') || (buffer[j] == '\t') ) j--; buffer[j+1] = 0; t->trigger = strdup(buffer); assert( t->trigger != NULL ); //skip whitespaces after -> for( ; (buffer[i] == '\t') || (buffer[i] == ' '); i++) ; if ( buffer[i] == '.' ) { t->terminal = true; i++; // terminal rule } else { t->terminal = false; // or not } j = i; // store this position and let's find the end i += strlen(buffer+j); for( i--; (buffer[i] == '\n') || (buffer[i] == '\r') ; i--) ; buffer[i+1] = 0; t->replacement = strdup(buffer+j); assert(t->replacement != NULL); if ( prev == NULL ) { *ruleset = t; } else { prev->next = t; } prev = t; }
}
// each line of the file is a "string" void markov(FILE *f, rule_t *rule) {
char buffer[2][MAX_STR_LEN]; // double to allow state changing and no overlapping int bi; rule_t *r; char *p, *d, *bp; bool repldone; size_t s;
while( ( fgets(buffer[0], MAX_STR_LEN, f) != NULL ) ) { bi = 0;
do { repldone = false; for( r = rule; r != NULL; r = r->next, bi++) {
bp = buffer[bi%2]; d = buffer[(bi+1)%2]; if ( (p = strstr(bp, r->trigger)) != NULL ) { s = p - bp; memcpy(d, bp, s); strcpy(d + s, r->replacement); strcpy(d + strlen(r->replacement) + s, bp + strlen(r->trigger) + s); if ( r->terminal ) { repldone = false; bi++; // let be bi the current (last) buffer break; } repldone = true; // a repl. was done r = rule; // since a repl. was done, let's "reset" r } else { bi--; // stay on the same buffer }
} } while( repldone ); } puts(buffer[(bi)%2]);
}
int main(int argc, char **argv) {
FILE *rulefile_h = NULL; FILE *stringfile_h = NULL; rule_t *rulelist;
if ( argc < 3 ) { printf("Usage: %s rulefile stringfile\n", argv[0]); exit(EXIT_FAILURE); } rulefile_h = fopen(argv[1], "r"); assert( rulefile_h != NULL ); stringfile_h = fopen(argv[2], "r"); assert( stringfile_h != NULL );
readrules(rulefile_h, &rulelist); assert( rulelist != NULL ); markov(stringfile_h, rulelist);
// dump rules
/*
rule_t *h = rulelist; while( h != NULL ) { printf("%s -> %s%s\n", h->trigger, h->replacement, h->terminal ? " [TERMINATING RULE]" : ""); h = h->next; }
- /
free_rulelist(rulelist);
fclose(rulefile_h); fclose(stringfile_h);
return EXIT_SUCCESS;
}</lang>
C++
Note: Non-use of iswhite
is intentional, since depending on the locale, other chars besides space and tab might be detected by that function.
<lang cpp>
- include <cstdlib>
- include <iostream>
- include <fstream>
- include <vector>
- include <string>
struct rule {
std::string pattern; std::string replacement; bool terminal; rule(std::string pat, std::string rep, bool term): pattern(pat), replacement(rep), terminal(term) { }
};
std::string const whitespace = " \t"; std::string::size_type const npos = std::string::npos;
bool is_whitespace(char c) {
return whitespace.find(c) != npos;
}
std::vector<rule> read_rules(std::ifstream& rulefile) {
std::vector<rule> rules; std::string line; while (std::getline(rulefile, line)) { std::string::size_type pos; // remove comments pos = line.find('#'); if (pos != npos) line.resize(pos); // ignore lines consisting only of whitespace if (line.find_first_not_of(whitespace) == npos) continue; // find "->" surrounded by whitespace pos = line.find("->"); while (pos != npos && (pos == 0 || !is_whitespace(line[pos-1]))) pos = line.find("->", pos+1); if (pos == npos || line.length() < pos+3 || !is_whitespace(line[pos+2])) { std::cerr << "invalid rule: " << line << "\n"; return EXIT_FAILURE; } std::string pattern = line.substr(0, pos-1); std::string replacement = line.substr(pos+3); // remove additional separating whitespace pattern.erase(pattern.find_last_not_of(whitespace)+1); replacement.erase(0, replacement.find_first_not_of(whitespace)); // test for terminal rule bool terminal = !replacement.empty() && replacement[0] == '.'; if (terminal) replacement.erase(0,1); rules.push_back(rule(pattern, replacement, terminal)); }
return rules;
}
std::string markov(std::vector<rule> rules, std::string input) {
std::string& output = input; std::vector<rule>::iterator iter = rules.begin();
// Loop through each rule, transforming our current version // with each rule. while (iter != rules.end()) { std::string::size_type pos = output.find(iter->pattern); if (pos != npos) { output.replace(pos, iter->pattern.length(), iter->replacement); if (iter->terminal) break; iter = rules.begin(); } ++iter; }
return output;
}
int main(int argc, char* argv[]) {
if (argc != 3) { std::cout << "usage:\n " << argv[0] << " rulefile text\n"; return EXIT_FAILURE; } std::ifstream rulefile(argv[1]); std::vector<rule> rules = read_rules(rulefile);
std::string input(argv[2]); std::string output = markov(rules, input);
std::cout << output << "\n";
}
</lang>
Haskell
This program expects a source file as an argument and uses the standard input and output devices for the algorithm's I/O.
<lang haskell>import Data.List (isPrefixOf) import Data.Maybe (catMaybes) import Control.Monad import Text.ParserCombinators.Parsec import System.IO import System.Environment (getArgs)
main = do
args <- getArgs unless (length args == 1) $ fail "Please provide exactly one source file as an argument." let sourcePath = head args source <- readFile sourcePath input <- getContents case parse markovParser sourcePath source of Right rules -> putStrLn $ runMarkov rules input Left err -> hPutStrLn stderr $ "Parse error at " ++ show err
data Rule = Rule
{from :: String, terminating :: Bool, to :: String}
markovParser :: Parser [Rule] markovParser = liftM catMaybes $
(comment <|> rule) `sepEndBy` many1 newline where comment = char '#' >> skipMany nonnl >> return Nothing rule = liftM Just $ liftM3 Rule (manyTill (nonnl <?> "pattern character") $ try arrow) (succeeds $ char '.') (many nonnl) arrow = ws >> string "->" >> ws <?> "whitespace-delimited arrow" nonnl = noneOf "\n" ws = many1 $ oneOf " \t" succeeds p = option False $ p >> return True
runMarkov :: [Rule] -> String -> String runMarkov rules s = f rules s
where f [] s = s f (Rule from terminating to : rs) s = g "" s where g _ "" = f rs s g before ahead@(a : as) = if from `isPrefixOf` ahead then let new = reverse before ++ to ++ drop (length from) ahead in if terminating then new else f rules new else g (a : before) as</lang>
J
Solution:<lang j>require'strings regex'
markovLexer =: verb define
rules =. LF cut TAB&=`(,:&' ')}y rules =. a: -.~ (dltb@:{.~ i:&'#')&.> rules rules =. 0 _1 {"1 '\s+->\s+' (rxmatch rxcut ])S:0 rules (,. ] (}.&.>~ ,. ]) ('.'={.)&.>)/ |: rules
)
replace =: dyad define
'index patternLength replacement'=. x 'head tail' =. index split y head, replacement, patternLength }. tail
)
matches =: E. i. 1:
markov =: dyad define
ruleIdx =. 0 [ rules =. markovLexer x while. ruleIdx < #rules do. 'pattern replacement terminating' =. ruleIdx { rules ruleIdx =. 1 + ruleIdx if. (#y) > index =. pattern matches y do. y =. (index ; (#pattern) ; replacement) replace y ruleIdx =. _ * terminating end. end. y
)</lang>
Example:<lang j> m1 =. noun define # This rules file is extracted from Wikipedia: # http://en.wikipedia.org/wiki/Markov_Algorithm A -> apple B -> bag S -> shop T -> the the shop -> my brother a never used -> .terminating rule )
m1 markov 'I bought a B of As from T S.'
I bought a bag of apples from my brother. </lang> Discussion: The J implementation correctly processes all the rulesets, including the stretch goals. More details are available on the the talk page.
Perl
This program expects a source file as an argument and uses the standard input and output devices for the algorithm's I/O.
<lang perl>@ARGV == 1 or die "Please provide exactly one source file as an argument.\n"; open my $source, '<', $ARGV[0] or die qq[I couldn't open "$ARGV[0]" for reading. ($!.)\n]; my @rules; while (<$source>)
{/\A#/ and next; my @a = /(.*?)\s+->\s+(\.?)(.*)/ or die "Syntax error: $_"; push @rules, \@a;}
close $source;
my $input = do {local $/; <>;};
OUTER:
{foreach (@rules) {my ($from, $terminating, $to) = @$_; $input =~ s/\Q$from\E/$to/ and ($terminating ? last OUTER : redo OUTER);}}
print $input;</lang>
Python
The example uses a regexp to parse the syntax of the grammar. This regexp is multi-line and verbose, and uses named groups to aid in understanding the regexp and to allow more meaningful group names to be used when extracting the replacement data from the grammars in function extractreplacements
.
<lang python>import re
syntaxre = r"""(?mx) ^(?:
(?: (?P<comment> \# .* ) ) | (?: (?P<blank> \s* ) (?: \n | $ ) ) | (?: (?P<rule> (?P<pat> .+? ) \s+ -> \s+ (?P<term> \.)? (?P<repl> .+) ) )
)$ """
grammar1 = """\
- This rules file is extracted from Wikipedia:
- http://en.wikipedia.org/wiki/Markov_Algorithm
A -> apple B -> bag S -> shop T -> the the shop -> my brother a never used -> .terminating rule """
grammar2 = \
- Slightly modified from the rules on Wikipedia
A -> apple B -> bag S -> .shop T -> the the shop -> my brother a never used -> .terminating rule
grammar3 = \
- BNF Syntax testing rules
A -> apple WWWW -> with Bgage -> ->.* B -> bag ->.* -> money W -> WW S -> .shop T -> the the shop -> my brother a never used -> .terminating rule
grammar4 = \
- Unary Multiplication Engine, for testing Markov Algorithm implementations
- By Donal Fellows.
- Unary addition engine
_+1 -> _1+ 1+1 -> 11+
- Pass for converting from the splitting of multiplication into ordinary
- addition
1! -> !1 ,! -> !+ _! -> _
- Unary multiplication by duplicating left side, right side times
1*1 -> x,@y 1x -> xX X, -> 1,1 X1 -> 1X _x -> _X ,x -> ,X y1 -> 1y y_ -> _
- Next phase of applying
1@1 -> x,@y 1@_ -> @_ ,@_ -> !_ ++ -> +
- Termination cleanup for addition
_1 -> 1 1+_ -> 1 _+_ ->
text1 = "I bought a B of As from T S."
text2 = "I bought a B of As W my Bgage from T S."
text3 = '_1111*11111_'
def extractreplacements(grammar):
return [ (matchobj.group('pat'), matchobj.group('repl'), bool(matchobj.group('term'))) for matchobj in re.finditer(syntaxre, grammar) if matchobj.group('rule')]
def replace(text, replacements):
while True: for pat, repl, term in replacements: if pat in text: text = text.replace(pat, repl, 1) if term: return text break else: return text
if __name__ == '__main__':
assert replace(text1, extractreplacements(grammar1)) \ == 'I bought a bag of apples from my brother.' assert replace(text1, extractreplacements(grammar2)) \ == 'I bought a bag of apples from T shop.' # Stretch goals assert replace(text2, extractreplacements(grammar3)) \ == 'I bought a bag of apples with my money from T shop.' assert replace(text3, extractreplacements(grammar4)) \ == '11111111111111111111'
</lang>
Ruby
<lang Ruby>raise "Please input an input code file, an input data file, and an output file." if ARGV.size < 3
rules = File.readlines(ARGV[0]).inject([]) do |rules, line|
if line =~ /^\s*#/ rules elsif line =~ /^(.+)\s+->\s+(\.?)(.*)$/ rules << [$1, $3, $2 != ""] else raise "Syntax error: #{line}" end
end
File.open(ARGV[2], "w") do |file|
file.write(File.read(ARGV[1]).tap { |input_data| while (matched = rules.find { |match, replace, term| input_data[match] and input_data.sub!(match, replace) }) and !matched[2] end })
end</lang>
Tcl
<lang tcl>package require Tcl 8.5 if {$argc < 3} {error "usage: $argv0 ruleFile inputFile outputFile"} lassign $argv ruleFile inputFile outputFile
- Read the file of rules
set rules {} set f [open $ruleFile] foreach line [split [read $f] \n[close $f]] {
if {[string match "#*" $line] || $line eq ""} continue if {[regexp {^(.+)\s+->\s+(\.?)(.*)$} $line -> from final to]} {
lappend rules $from $to [string compare "." $final] [string length $from]
} else {
error "Syntax error: \"$line\""
}
}
- Apply the rules
set f [open $inputFile] set out [open $outputFile w] foreach line [split [read $f] \n[close $f]] {
set any 1 while {$any} {
set any 0 foreach {from to more fl} $rules { # If we match the 'from' pattern... if {[set idx [string first $from $line]] >= 0} { # Change for the 'to' replacement set line [string replace $line $idx [expr {$idx+$fl-1}] $to]
# Stop if we terminate, otherwise note that we've more work to do
set any $more
break; # Restart search for rules to apply } }
#DEBUG# puts $line }
# Output the processed line puts $out $line
} close $out</lang> In the case where there are no terminating rules and no overlapping issues, the following is an alternative: <lang tcl>package require Tcl 8.5 if {$argc < 3} {error "usage: $argv0 ruleFile inputFile outputFile"} lassign $argv ruleFile inputFile outputFile
- Read the file of rules
set rules {} set f [open $ruleFile] foreach line [split [read $f] \n[close $f]] {
if {[string match "#*" $line] || $line eq ""} continue if {[regexp {^(.+)\s+->\s+(.*)$} $line -> from to]} { dict set rules $from $to } else {
error "Syntax error: \"$line\""
}
}
- Apply the rules in a simplistic manner
set in [open $inputFile] set out [open $outputFile w] set data [read $in] close $in while 1 {
set newData [string map $rules $data] if {$newData eq $data} break set data $newData
} puts $out $data close $out</lang>