Word break problem
- Task
- Given an input string and a dictionary of words, segment the input string into a space-separated sequence of dictionary words if possible.
Aime
<lang aime>integer wordbreak(record dict, text phrase, integer p, list words) {
integer complete, n; text s;
complete = 0; if (rsk_lower(dict, phrase, s)) { if (s == phrase) { l_append(words, s); complete = 1; } else { do { n = 0; while (phrase[n] == s[n]) { n += 1; } if (!n) { break; } l_append(words, cut(s, 0, n)); complete = wordbreak(dict, project(phrase, n), p + 1, words); if (complete) { break; } l_delete(words, -1); } while (rsk_less(dict, s, s)); } }
if (!p) { o_(phrase, ":"); if (complete) { l_ucall(words, o_, 1, " "); } else { o_(" can't break"); } o_newline();
l_clear(words); }
return complete;
}
integer
main(void)
{
record dict;
r_fit(dict, "a", 0, "bc", 0, "abc", 0, "cd", 0, "b", 0); l_ucall(l_effect("abcd", "abbc", "abcbcd", "acdbc", "abcdd"), wordbreak, 1, dict, 0, l_effect());
return 0;
}</lang>
- Output:
abcd: a b cd abbc: a b bc abcbcd: abc b cd acdbc: a cd bc abcdd: can't break
Perl 6
This implementation does not necessarily find every combination, it returns the one with the longest matching tokens.
<lang perl6>my @words = <a bc abc cd b>; my $regex = @words.join('|');
put "$_: ", word-break($_) for <abcd abbc abcbcd acdbc abcdd>;
sub word-break (Str $word) { ($word ~~ / ^ (<$regex>)+ $ /)[0] // "Not possible" }</lang>
- Output:
abcd: a b cd abbc: a b bc abcbcd: abc b cd acdbc: a cd bc abcdd: Not possible
Phix
See talk page <lang Phix>procedure populate_dict(sequence s)
for i=1 to length(s) do setd(s[i],0) end for
end procedure
function valid_word(string word)
if length(word)=1 then return find(word,{"a","i"})!=0 end if if find(word,{"sis","sst","se"}) then return false end if for i=1 to length(word) do integer ch = word[i] if ch<'a' or ch>'z' then return false end if end for return true
end function
--populate_dict(split("a bc abc cd b")) integer fn = open("demo\\unixdict.txt","r") sequence words = get_text(fn,GT_LF_STRIPPED) close(fn) for i=length(words) to 1 by -1 do
if not valid_word(words[i]) then words[i] = words[$] words = words[1..$-1] end if
end for populate_dict(words)
function prrec(sequence wordstarts, integer idx, sequence sofar, bool show)
if idx>length(wordstarts) then if show then ?sofar end if return 1 end if integer res = 0 for i=1 to length(wordstarts[idx]) do string w = wordstarts[idx][i] res += prrec(wordstarts,idx+length(w),append(sofar,w),show) end for return res
end function
function flattens(sequence s) -- remove all nesting and empty sequences from a nested sequence of strings sequence res = {}, si
for i=1 to length(s) do si = s[i] if string(si) then res = append(res,si) else res &= flattens(si) end if end for return res
end function
procedure test(string s)
integer l = length(s)
sequence wordstarts = repeat({},l), wordends = repeat(0,l)
integer wordend = 1 -- (pretend a word just ended at start)
for i=1 to l do if wordend then for j=i to l do object pkey = getd_partial_key(s[i..j]) if string(pkey) and length(pkey)>j-i and s[i..j]=pkey[1..j-i+1] then if length(pkey)=j-i+1 then -- exact match wordstarts[i] = append(wordstarts[i],pkey) wordends[j] += 1 end if else exit end if end for end if wordend = wordends[i] end for bool worthwhile = true while worthwhile do worthwhile = false wordend = 1 -- (pretend a word just ended at start) for i=1 to l do if wordend then -- eliminate any words that end before a wordstarts of {}. for j=length(wordstarts[i]) to 1 by -1 do integer wl = length(wordstarts[i][j]) if i+wl<=l and wordstarts[i+wl]={} then wordends[i+wl-1] -= 1 wordstarts[i][j..j] = {} worthwhile = true end if end for else -- elimitate all words that start here. for j=1 to length(wordstarts[i]) do integer wl = length(wordstarts[i][j]) if i+wl<=l then wordends[i+wl-1] -= 1 worthwhile = true end if end for wordstarts[i] = {} end if wordend = wordends[i] end for end while
--?{wordstarts,wordends}
if sum(wordends)=0 then printf(1,"%s: not possible",{s}) else integer count = prrec(wordstarts,1,{},false) if count=1 then printf(1,"%s: 1 solution: %s\n",{s,join(flattens(wordstarts))}) elsif count>20 then printf(1,"%s: %d solution(s): (too many to show)\n",{s,count}) pp({wordstarts,wordends}) else printf(1,"%s: %d solution(s):\n",{s,count}) count = prrec(wordstarts,1,{},true) end if end if
end procedure
--constant tests = {"abcd","abbc","abcbcd","acdbc","abcdd"} constant tests = {"wordsisstringofspaceseparatedwords"} for i=1 to length(tests) do test(tests[i]) end for</lang>
REXX
This REXX version allows the words to be tested (and the dictionary words) to be specified on the command line. <lang rexx>/*REXX program breaks up a word (or string) into a list of words from a dictionary. */ parse arg a '/' x; $=space($); x=space(x) /*get optional args; elide extra blanks*/ if a== | a=="," then a='abcd abbc abcbcd acdbc abcdd' /*maybe use the defaults ? */ if x== | x=="," then x='a bc abc cd b' /* " " " " */ na=words(a) /*the number of words to be tested. */ nx=words(x) /* " " " " " the dictionary*/ say nx 'dictionary words:' x /*display the words in the dictionary. */ say /*display a blank line to the terminal.*/ aw=0; do i=1 for na; _=word(a,i) /*obtain a word that will be tested. */
aw=max(aw,length(_)) /*find widest width word being tested. */ end /*i*/ /* [↑] AW is used to align the output*/
@.=0 /*initialize the dictionary to "null". */ xw=0; do i=1 for nx; _=word(x,i) /*obtain a word from the dictionary. */
xw=max(xw,length(_)); @._=1 /*find widest width dictionary word. */ end /*i*/ /* [↑] define a dictionary word. */
p=0 /* [↓] process a word in the A list.*/
do j=1 for na; yy=word(a,j) /*YY: test a word from the A list.*/ do t=(nx+1)**(xw+1) by -1 to 1 until y==; y=yy /*try a word possibility.*/ $= /*nullify the (possible) result list. */ do try=t while y\= /*keep testing until Y is exhausted. */ p=(try+p)//xw+1 /*use a possible width for this attempt*/ p=fw(y,p); if p==0 then iterate t /*is this part of the word not found ? */ $=$ ? /*It was found. Add partial to the list*/ y=substr(y,p+1) /*now, use and test the rest of word. */ end /*try*/ end /*t*/
if t==0 then $= '(not possible)' /*indicate that the word isn't possible*/ say right(yy,aw) '───►' strip($) /*display the result to the terminal. */ end /*j*/
exit /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ fw: parse arg z,L; do k=L by -1 for L; ?=left(z,k); if @.? then leave; end; return k</lang>
- output when using the default inputs:
5 dictionary words: a bc abc cd b abcd ───► a b cd abbc ───► a b bc abcbcd ───► abc b cd acdbc ───► a cd bc abcdd ───► (not possible)
Rust
Dynamic programming
<lang rust>use std::collections::HashSet; fn create_string(s: &str, v: Vec<Option<usize>>) -> String {
let mut idx = s.len(); let mut slice_vec = vec![]; while let Some(prev) = v[idx] { slice_vec.push(&s[prev..idx]); idx = prev; } slice_vec.reverse(); slice_vec.join(" ")
}
fn word_break(s: &str, dict: HashSet<&str>) -> Option<String> {
let size = s.len() + 1; let mut possible = vec![None; size];
let check_word = |i,j| dict.get(&s[i..j]).map(|_| i);
for i in 1..size { possible[i] = possible[i].or_else(|| check_word(0,i));
if possible[i].is_some() { for j in i+1..size { possible[j] = possible[j].or_else(|| check_word(i,j)); }
if possible[s.len()].is_some() { return Some(create_string(s, possible)); }
}; } None
}
fn main() {
let mut set = HashSet::new(); set.insert("a"); set.insert("bc"); set.insert("abc"); set.insert("cd"); set.insert("b"); println!("{:?}", word_break("abcd", set).unwrap());
}</lang>
- Output:
"a b cd"
zkl
<lang zkl>fcn wordBreak(str,words){ // words is string of space seperated words
words=words.split(" "); // to list of words r:=fcn(str,words,sink){ // recursive search, easy to collect answer foreach word in (words){
if(not str) return(True); // consumed string ie matched everything if(str.find(word)==0){ // word starts str, 0 so answer is ordered z:=word.len(); if(self.fcn(str.del(0,z),words,sink)) return(sink.write(word)); }
} False // can't make forward progress, back out & retry }(str,words,List()); // run the lambda if(False==r) return("not possible"); r.reverse().concat(" ")
}</lang> <lang zkl>foreach text in (T("abcd","abbc","abcbcd","acdbc","abcdd")){
println(text,": ",wordBreak(text,"a bc abc cd b"))
}</lang>
- Output:
abcd: a b cd abbc: a b bc abcbcd: a bc b cd acdbc: a cd bc abcdd: not possible