Word break problem

From Rosetta Code
Word break problem is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.
Task
Given an input string and a dictionary of words, segment the input string into a space-separated sequence of dictionary words if possible.

Aime[edit]

integer
wordbreak(record dict, text phrase, integer p, list words)
{
integer complete, n;
text s;
 
complete = 0;
if (rsk_lower(dict, phrase, s)) {
if (s == phrase) {
l_append(words, s);
complete = 1;
} else {
do {
n = 0;
while (phrase[n] == s[n]) {
n += 1;
}
if (!n) {
break;
}
l_append(words, cut(s, 0, n));
complete = wordbreak(dict, project(phrase, n), p + 1, words);
if (complete) {
break;
}
l_delete(words, -1);
} while (rsk_less(dict, s, s));
}
}
 
if (!p) {
o_(phrase, ":");
if (complete) {
l_ucall(words, o_, 1, " ");
} else {
o_(" can't break");
}
o_newline();
 
l_clear(words);
}
 
return complete;
}
 
 
integer
main(void)
{
record dict;
 
r_fit(dict, "a", 0, "bc", 0, "abc", 0, "cd", 0, "b", 0);
l_ucall(l_effect("abcd", "abbc", "abcbcd", "acdbc", "abcdd"), wordbreak, 1, dict, 0, l_effect());
 
return 0;
}
Output:
abcd: a b cd
abbc: a b bc
abcbcd: abc b cd
acdbc: a cd bc
abcdd: can't break

Julia[edit]

Some extra loops to record and print all solutions.
 
words = ["a", "bc", "abc", "cd", "b"]
strings = ["abcd", "abbc", "abcbcd", "acdbc", "abcdd"]
 
subregex = join(words, ")|(")
regexes = ["\^\(\($subregex\)\)\{$i}\$" for i in 6:-1:1]
 
function wordbreak()
for s in strings
solutions = []
for regex in regexes
rmat = match(Regex(regex), s)
if rmat != nothing
push!(solutions, ["$w" for w in Set(rmat.captures) if w != nothing])
end
end
if length(solutions) > 0
println("$(length(solutions)) Solution(s) for $s:")
for sol in solutions
println(" Solution: $(sol)")
end
else
println("No solutions for $s : No fitting matches found.")
end
end
end
 
wordbreak()
Output:

1 Solution(s) for abcd:

  Solution: SubString{String}["cd", "b", "a"]

1 Solution(s) for abbc:

  Solution: SubString{String}["b", "a", "bc"]

2 Solution(s) for abcbcd:

  Solution: SubString{String}["cd", "b", "a", "bc"]
  Solution: SubString{String}["cd", "abc", "b"]

1 Solution(s) for acdbc:

  Solution: SubString{String}["cd", "a", "bc"]
No solutions for abcdd : No fitting matches found.

Kotlin[edit]

I've downloaded the free dictionary at http://www.puzzlers.org/pub/wordlists/unixdict.txt for this task. All single letters from 'a' to 'z' are considered to be words by this dictionary but 'bc' and 'cd' which I'd have expected to be present are not.

// version 1.1.3
 
import java.io.File
 
val partitions = mutableListOf<List<String>>()
 
fun partitionString(s: String, ml: MutableList<String>, level: Int) {
for (i in s.length - 1 downTo 1) {
val part1 = s.substring(0, i)
val part2 = s.substring(i)
ml.add(part1)
ml.add(part2)
partitions.add(ml.toList())
if (part2.length > 1) {
ml.removeAt(ml.lastIndex)
partitionString(part2, ml, level + 1)
}
while (ml.size > level) ml.removeAt(ml.lastIndex)
}
}
 
fun main(args: Array<String>) {
val words = File("unixdict.txt").readLines()
val strings = listOf("abcd", "abbc", "abcbcd", "acdbc", "abcdd")
for (s in strings) {
partitions.clear()
partitions.add(listOf(s))
val ml = mutableListOf<String>()
partitionString(s, ml, 0)
val solutions = mutableListOf<List<String>>()
for (partition in partitions) {
var allInDict = true
for (item in partition) {
if (words.indexOf(item) == -1) {
allInDict = false
break
}
}
if (allInDict) solutions.add(partition)
}
val plural = if (solutions.size == 1) "" else "s"
println("$s: ${solutions.size} solution$plural")
for (solution in solutions) {
println(" ${solution.joinToString(" ")}")
}
println()
}
}
Output:
abcd: 2 solutions
    abc d
    a b c d

abbc: 1 solution
    a b b c

abcbcd: 3 solutions
    abc b c d
    a b cb c d
    a b c b c d

acdbc: 2 solutions
    ac d b c
    a c d b c

abcdd: 2 solutions
    abc d d
    a b c d d

Lua[edit]

-- a specialized dict format is used to minimize the 
-- possible candidates for this particalur problem
function genDict(ws)
local d,dup,head,rest = {},{}
for w in ws:gmatch"%w+" do
local lw = w:lower()
if not dup[lw] then
dup[lw], head,rest = true, lw:match"^(%w)(.-)$"
d[head] = d[head] or {n=-1}
local len = #rest
d[head][len] = d[head][len] or {}
d[head][len][rest] = true
if len>d[head].n then
d[head].n = len
end
end
end
return d
end
 
-- sample default dict
local defWords = "a;bc;abc;cd;b"
local defDict = genDict(defWords)
 
function wordbreak(w, dict)
if type(w)~='string' or w:len()==0 then
return nil,'emprty or not a string'
end
 
dict = type(dict)=='string' and genDict(dict) or dict or defDict
 
local r, len = {}, #w
 
-- backtracking
local function try(i)
if i>len then return true end
local head = w:sub(i,i):lower()
local d = dict[head]
if not d then return end
for j=math.min(d.n, len-i),0,-1 do -- prefer longer first
if d[j] then
local rest = w:sub(i+1,i+j):lower()
if d[j][rest] then
r[1+#r] = w:sub(i,i+j)
if try(i+j+1) then
return true
else
r[#r]=nil
end
end
end
end
end
 
if try(1) then
return table.unpack(r)
else
return nil,'-no solution-'
end
end
 
-- test
local test = {'abcd','abbc','abcbcd','acdbc','abcdd' }
for i=1,#test do
print(test[i],wordbreak(test[i]))
end
Output:
abcd	a	b	cd
abbc	a	b	bc
abcbcd	abc	b	cd
acdbc	a	cd	bc
abcdd	nil	-no solution-

Perl 6[edit]

Works with: Rakudo version 2017.04

This implementation does not necessarily find every combination, it returns the one with the longest matching tokens.

my @words = <a bc abc cd b>;
my $regex = @words.join('|');
 
put "$_: ", word-break($_) for <abcd abbc abcbcd acdbc abcdd>;
 
sub word-break (Str $word) { ($word ~~ / ^ (<$regex>)+ $ /)[0] // "Not possible" }
Output:
abcd: a b cd
abbc: a b bc
abcbcd: abc b cd
acdbc: a cd bc
abcdd: Not possible

Phix[edit]

See talk page

procedure populate_dict(sequence s)
?s
for i=1 to length(s) do setd(s[i],0) end for
end procedure
 
--/*
function valid_word(string word)
if length(word)=1 then return find(word,{"a","i"})!=0 end if
if find(word,{"sis","sst","se"}) then return false end if -- hack
for i=1 to length(word) do
integer ch = word[i]
if ch<'a'
or ch>'z' then
return false
end if
end for
return true
end function
--*/
 
populate_dict(split("a bc abc cd b"))
--/*
integer fn = open("demo\\unixdict.txt","r")
sequence words = get_text(fn,GT_LF_STRIPPED)
close(fn)
for i=length(words) to 1 by -1 do
if not valid_word(words[i]) then
words[i] = words[$]
words = words[1..$-1]
end if
end for
populate_dict(words)
--*/
 
function prrec(sequence wordstarts, integer idx, sequence sofar, bool show)
if idx>length(wordstarts) then
if show then
 ?sofar
end if
return 1
end if
integer res = 0
for i=1 to length(wordstarts[idx]) do
string w = wordstarts[idx][i]
res += prrec(wordstarts,idx+length(w),append(sofar,w),show)
end for
return res
end function
 
function flattens(sequence s)
-- remove all nesting and empty sequences from a nested sequence of strings
sequence res = {}, si
for i=1 to length(s) do
si = s[i]
if string(si) then
res = append(res,si)
else
res &= flattens(si)
end if
end for
return res
end function
 
procedure test(string s)
integer l = length(s)
sequence wordstarts = repeat({},l), wordends = repeat(0,l)
integer wordend = 1 -- (pretend a word just ended at start)
for i=1 to l do
if wordend then
for j=i to l do
object pkey = getd_partial_key(s[i..j])
if string(pkey) and length(pkey)>j-i and s[i..j]=pkey[1..j-i+1] then
if length(pkey)=j-i+1 then
-- exact match
wordstarts[i] = append(wordstarts[i],pkey)
wordends[j] += 1
end if
else
exit
end if
end for
end if
wordend = wordends[i]
end for
bool worthwhile = true
while worthwhile do
worthwhile = false
wordend = 1 -- (pretend a word just ended at start)
for i=1 to l do
if wordend then
-- eliminate any words that end before a wordstarts of {}.
for j=length(wordstarts[i]) to 1 by -1 do
integer wl = length(wordstarts[i][j])
if i+wl<=l and wordstarts[i+wl]={} then
wordends[i+wl-1] -= 1
wordstarts[i][j..j] = {}
worthwhile = true
end if
end for
else
-- elimitate all words that start here.
for j=1 to length(wordstarts[i]) do
integer wl = length(wordstarts[i][j])
if i+wl<=l then
wordends[i+wl-1] -= 1
worthwhile = true
end if
end for
wordstarts[i] = {}
end if
wordend = wordends[i]
end for
end while
--?{wordstarts,wordends}
if sum(wordends)=0 then
printf(1,"%s: not possible",{s})
else
integer count = prrec(wordstarts,1,{},false)
if count=1 then
printf(1,"%s: 1 solution: %s\n",{s,join(flattens(wordstarts))})
elsif count>20 then
printf(1,"%s: %d solution(s): (too many to show)\n",{s,count})
pp({wordstarts,wordends})
else
printf(1,"%s: %d solution(s):\n",{s,count})
count = prrec(wordstarts,1,{},true)
end if
end if
end procedure
 
constant tests = {"abcd","abbc","abcbcd","acdbc","abcdd"}
--constant tests = {"wordsisstringofspaceseparatedwords"}
for i=1 to length(tests) do test(tests[i]) end for
Output:
{"a","bc","abc","cd","b"}
abcd: 1 solution: a b cd
abbc: 1 solution: a b bc
abcbcd: 2 solution(s):
{"a","bc","b","cd"}
{"abc","b","cd"}
acdbc: 1 solution: a cd bc
abcdd: not possible

PicoLisp[edit]

(setq *Dict (quote "a" "bc" "abc" "cd" "b"))
(setq *Dict2
(quote
"mobile" "samsung" "sam" "sung" "man" "mango"
"icecream" "and" "go" "i" "like" "ice" "cream" ) )
 
(de word (Str D)
(let
(Str (chop Str)
Len (length Str)
DP (need (inc Len))
Res (need (inc Len))
B 1 )
(set DP 0)
(map
'((L)
(and
(get DP B)
(for N (length L)
(let Str (pack (head N L))
(when (member Str D)
(set (nth Res (+ B N))
(copy (get Res B)) )
(queue (nth Res (+ B N)) Str)
(set (nth DP (+ B N))
(inc (get DP B)) ) ) ) ) )
(inc 'B) )
Str )
(last Res) ) )
 
(println (word "abcd" *Dict))
(println (word "abbc" *Dict))
(println (word "abcbcd" *Dict))
(println (word "acdbc" *Dict))
(println (word "abcdd" *Dict))
(println (word "ilikesamsung" *Dict2))
(println (word "iii" *Dict2))
(println (word "ilikelikeimangoiii" *Dict2))
(println (word "samsungandmango" *Dict2))
(println (word "samsungandmangok" *Dict2))
(println (word "ksamsungandmango" *Dict2))
Output:
("a" "b" "cd")
("a" "b" "bc")
("a" "bc" "b" "cd")
("a" "cd" "bc")
NIL
("i" "like" "sam" "sung")
("i" "i" "i")
("i" "like" "like" "i" "man" "go" "i" "i" "i")
("sam" "sung" "and" "man" "go")
NIL
NIL

REXX[edit]

This REXX version allows the words to be tested (and the dictionary words) to be specified on the command line.

/*REXX program breaks up a  word (or string)  into a  list of words  from a dictionary. */
parse arg a '/' x; a=space(a); x=space(x) /*get optional args; elide extra blanks*/
if a=='' | a=="," then a= 'abcd abbc abcbcd acdbc abcdd' /*maybe use the defaults ? */
if x=='' | x=="," then x= 'a bc abc cd b' /* " " " " */
na=words(a) /*the number of words to be tested. */
nx=words(x) /* " " " " " the dictionary*/
say nx ' dictionary words: ' x /*display the words in the dictionary. */
say /*display a blank line to the terminal.*/
aw=0; do i=1 for na; _=word(a, i) /*obtain a word that will be tested. */
aw=max(aw, length(_) ) /*find widest width word being tested. */
end /*i*/ /* [↑] AW is used to align the output*/
@.=0 /*initialize the dictionary to "null". */
xw=0; do i=1 for nx; _=word(x, i) /*obtain a word from the dictionary. */
xw=max(xw, length(_) ); @._=1 /*find widest width dictionary word. */
end /*i*/ /* [↑] define a dictionary word. */
p=0 /* [↓] process a word in the A list.*/
do j=1 for na; yy=word(a, j) /*YY: test a word from the A list.*/
do t=(nx+1)**(xw+1) by -1 to 1 until y==''; y=yy /*try word possibility.*/
$= /*nullify the (possible) result list. */
do try=t while y\='' /*keep testing until Y is exhausted. */
p=(try + p) // xw + 1 /*use a possible width for this attempt*/
p=fw(y, p); if p==0 then iterate t /*is this part of the word not found ? */
$=$ ? /*It was found. Add partial to the list*/
y=substr(y, p + 1) /*now, use and test the rest of word. */
end /*try*/
end /*t*/
 
if t==0 then $= '(not possible)' /*indicate that the word isn't possible*/
say right(yy, aw) '───►' strip($) /*display the result to the terminal. */
end /*j*/
exit /*stick a fork in it, we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
fw: parse arg z,L; do k=L by -1 for L; ?=left(z,k); if @.? then leave; end; return k
output   when using the default inputs:
5  dictionary words:  a bc abc cd b

  abcd ───► a b cd
  abbc ───► a b bc
abcbcd ───► abc b cd
 acdbc ───► a cd bc
 abcdd ───► (not possible)

Rust[edit]

Dynamic programming[edit]

use std::collections::HashSet;
fn create_string(s: &str, v: Vec<Option<usize>>) -> String {
let mut idx = s.len();
let mut slice_vec = vec![];
while let Some(prev) = v[idx] {
slice_vec.push(&s[prev..idx]);
idx = prev;
}
slice_vec.reverse();
slice_vec.join(" ")
 
 
}
 
fn word_break(s: &str, dict: HashSet<&str>) -> Option<String> {
let size = s.len() + 1;
let mut possible = vec![None; size];
 
let check_word = |i,j| dict.get(&s[i..j]).map(|_| i);
 
for i in 1..size {
possible[i] = possible[i].or_else(|| check_word(0,i));
 
if possible[i].is_some() {
for j in i+1..size {
possible[j] = possible[j].or_else(|| check_word(i,j));
}
 
if possible[s.len()].is_some() {
return Some(create_string(s, possible));
}
 
};
}
None
}
 
fn main() {
let mut set = HashSet::new();
set.insert("a");
set.insert("bc");
set.insert("abc");
set.insert("cd");
set.insert("b");
println!("{:?}", word_break("abcd", set).unwrap());
}
Output:
"a b cd"

zkl[edit]

fcn wordBreak(str,words){	// words is string of space seperated words
words=words.split(" "); // to list of words
r:=fcn(str,words,sink){ // recursive search, easy to collect answer
foreach word in (words){
if(not str) return(True); // consumed string ie matched everything
if(str.find(word)==0){ // word starts str, 0 so answer is ordered
z:=word.len();
if(self.fcn(str.del(0,z),words,sink)) return(sink.write(word));
}
}
False // can't make forward progress, back out & retry
}(str,words,List()); // run the lambda
if(False==r) return("not possible");
r.reverse().concat(" ")
}
foreach text in (T("abcd","abbc","abcbcd","acdbc","abcdd")){
println(text,": ",wordBreak(text,"a bc abc cd b"))
}
Output:
abcd: a b cd
abbc: a b bc
abcbcd: a bc b cd
acdbc: a cd bc
abcdd: not possible