Word break problem: Difference between revisions

Content added Content deleted

Inline

Revision as of 14:39, 26 May 2018

Task: Given an input string and a dictionary of words, segment the input string into a space-separated sequence of dictionary words if possible.

Aime

<lang aime>integer wordbreak(record dict, text phrase, integer p, list words) {

   integer complete, n;
   text s;

   complete = 0;
   if (rsk_lower(dict, phrase, s)) {
       if (s == phrase) {
           words.append(s);
           complete = 1;
       } else {
           do {
               n = 0;
               while (phrase[n] == s[n]) {
                   n += 1;
               }
               if (!n) {
                   break;
               }
               words.append(cut(s, 0, n));
               complete = wordbreak(dict, project(phrase, n), p + 1, words);
               if (complete) {
                   break;
               }
               words.delete(-1);
           } while (rsk_less(dict, s, s));
       }
   }

   if (!p) {
       o_(phrase, ":");
       if (complete) {
           words.ucall(o_, 1, " ");
       } else {
           o_(" can't break");
       }
       o_newline();

       words.clear;
   }

   complete;

}

integer main(void) {

   record dict;

   dict.fit("a", 0, "bc", 0, "abc", 0, "cd", 0, "b", 0);
   list("abcd", "abbc", "abcbcd", "acdbc", "abcdd").ucall(wordbreak, 1, dict, 0, list());

   return 0;

}</lang>

Output:

abcd: a b cd
abbc: a b bc
abcbcd: abc b cd
acdbc: a cd bc
abcdd: can't break

Go

<lang go>package main

import (

   "fmt"
   "strings"

)

type dict map[string]bool

func newDict(words ...string) dict {

   d := dict{}
   for _, w := range words {
       d[w] = true
   }
   return d

}

func (d dict) wordBreak(s string) (broken []string, ok bool) {

   if s == "" {
       return nil, true
   }
   type prefix struct {
       length int
       broken []string
   }
   bp := []prefixTemplate:0, nil
   for end := 1; end <= len(s); end++ {
       for i := len(bp) - 1; i >= 0; i-- {
           w := s[bp[i].length:end]
           if d[w] {
               b := append(bp[i].broken, w)
               if end == len(s) {
                   return b, true
               }
               bp = append(bp, prefix{end, b})
               break
           }
       }
   }
   return nil, false

}

func main() {

   d := newDict("a", "bc", "abc", "cd", "b")
   for _, s := range []string{"abcd", "abbc", "abcbcd", "acdbc", "abcdd"} {
       if b, ok := d.wordBreak(s); ok {
           fmt.Printf("%s: %s\n", s, strings.Join(b, " "))
       } else {
           fmt.Println("can't break")
       }
   }

}</lang>

Output:

abcd: a b cd
abbc: a b bc
abcbcd: a bc b cd
acdbc: a cd bc
can't break

Julia

Some extra loops to record and print all solutions.<lang Julia> words = ["a", "bc", "abc", "cd", "b"] strings = ["abcd", "abbc", "abcbcd", "acdbc", "abcdd"]

subregex = join(words, ")|(") regexes = ["\^$\($subregex$\)\{$i}\$" for i in 6:-1:1]

function wordbreak()

   for s in strings
       solutions = []
       for regex in regexes
           rmat = match(Regex(regex), s)
           if rmat != nothing
               push!(solutions, ["$w" for w in Set(rmat.captures) if w != nothing])
           end
       end
       if length(solutions) > 0
           println("$(length(solutions)) Solution(s) for $s:")
           for sol in solutions
               println("   Solution: $(sol)")
           end
       else
           println("No solutions for $s : No fitting matches found.")
       end
   end

end

wordbreak()</lang>

Output:


1 Solution(s) for abcd:

  Solution: SubString{String}["cd", "b", "a"]

1 Solution(s) for abbc:

  Solution: SubString{String}["b", "a", "bc"]

2 Solution(s) for abcbcd:

  Solution: SubString{String}["cd", "b", "a", "bc"]
  Solution: SubString{String}["cd", "abc", "b"]

1 Solution(s) for acdbc:

  Solution: SubString{String}["cd", "a", "bc"]

No solutions for abcdd : No fitting matches found.

I've downloaded the free dictionary at http://www.puzzlers.org/pub/wordlists/unixdict.txt for this task. All single letters from 'a' to 'z' are considered to be words by this dictionary but 'bc' and 'cd' which I'd have expected to be present are not. <lang scala>// version 1.1.3

import java.io.File

val partitions = mutableListOf<List<String>>()

fun partitionString(s: String, ml: MutableList<String>, level: Int) {

   for (i in s.length - 1 downTo 1) {
       val part1 = s.substring(0, i)
       val part2 = s.substring(i)
       ml.add(part1)
       ml.add(part2)
       partitions.add(ml.toList())
       if (part2.length > 1) {
           ml.removeAt(ml.lastIndex)
           partitionString(part2, ml, level + 1)
       }
       while (ml.size > level) ml.removeAt(ml.lastIndex)
   }

}

fun main(args: Array<String>) {

   val words = File("unixdict.txt").readLines()
   val strings = listOf("abcd", "abbc", "abcbcd", "acdbc", "abcdd")
   for (s in strings) {
       partitions.clear()
       partitions.add(listOf(s))
       val ml = mutableListOf<String>()
       partitionString(s, ml, 0)
       val solutions = mutableListOf<List<String>>()
       for (partition in partitions) {
           var allInDict = true
           for (item in partition) {
               if (words.indexOf(item) == -1) {
                   allInDict = false
                   break
               }
           }
           if (allInDict) solutions.add(partition)
       }
       val plural = if (solutions.size == 1) "" else "s"
       println("$s: ${solutions.size} solution$plural")
       for (solution in solutions) {
           println("    ${solution.joinToString(" ")}")
       }
       println() 
   }

}</lang>

Output:

abcd: 2 solutions
    abc d
    a b c d

abbc: 1 solution
    a b b c

abcbcd: 3 solutions
    abc b c d
    a b cb c d
    a b c b c d

acdbc: 2 solutions
    ac d b c
    a c d b c

abcdd: 2 solutions
    abc d d
    a b c d d

Lua

<lang lua>-- a specialized dict format is used to minimize the -- possible candidates for this particalur problem function genDict(ws)

 local d,dup,head,rest = {},{}
 for w in ws:gmatch"%w+" do
   local lw = w:lower()
   if not dup[lw] then
     dup[lw], head,rest = true, lw:match"^(%w)(.-)$"
     d[head] = d[head] or {n=-1}
     local len = #rest
     d[head][len] = d[head][len] or {}
     d[head][len][rest] = true
     if len>d[head].n then 
       d[head].n = len 
     end
   end
 end
 return d

end

-- sample default dict local defWords = "a;bc;abc;cd;b" local defDict = genDict(defWords)

function wordbreak(w, dict)

 if type(w)~='string' or w:len()==0 then 
   return nil,'emprty or not a string'
 end
 
 dict = type(dict)=='string' and genDict(dict) or dict or defDict
 
 local r, len = {}, #w
 
 -- backtracking
 local function try(i)
   if i>len then return true end
   local head = w:sub(i,i):lower()
   local d = dict[head]
   if not d then return end
   for j=math.min(d.n, len-i),0,-1 do -- prefer longer first
     if d[j] then
       local rest = w:sub(i+1,i+j):lower()
       if d[j][rest] then
         r[1+#r] = w:sub(i,i+j)
         if try(i+j+1) then 
           return true 
         else 
           r[#r]=nil 
         end
       end            
     end
   end        
 end
 
 if try(1) then 
   return table.unpack(r) 
 else 
   return nil,'-no solution-'
 end

end

-- test local test = {'abcd','abbc','abcbcd','acdbc','abcdd' } for i=1,#test do

 print(test[i],wordbreak(test[i]))

end</lang>

Output:

abcd	a	b	cd
abbc	a	b	bc
abcbcd	abc	b	cd
acdbc	a	cd	bc
abcdd	nil	-no solution-

Perl 6

Works with: Rakudo version 2017.04

This implementation does not necessarily find every combination, it returns the one with the longest matching tokens.

<lang perl6>my @words = <a bc abc cd b>; my $regex = @words.join('|');

put "$_: ", word-break($_) for <abcd abbc abcbcd acdbc abcdd>;

sub word-break (Str $word) { ($word ~~ / ^ (<$regex>)+ $ /)[0] // "Not possible" }</lang>

Output:

abcd: a b cd
abbc: a b bc
abcbcd: abc b cd
acdbc: a cd bc
abcdd: Not possible

Phix

See talk page <lang Phix>procedure populate_dict(sequence s) ?s

   for i=1 to length(s) do setd(s[i],0) end for

end procedure

--/* function valid_word(string word)

   if length(word)=1 then return find(word,{"a","i"})!=0 end if
   if find(word,{"sis","sst","se"}) then return false end if  -- hack
   for i=1 to length(word) do
       integer ch = word[i]
       if ch<'a'
       or ch>'z' then
           return false
       end if
   end for
   return true

end function --*/

populate_dict(split("a bc abc cd b")) --/* integer fn = open("demo\\unixdict.txt","r") sequence words = get_text(fn,GT_LF_STRIPPED) close(fn) for i=length(words) to 1 by -1 do

   if not valid_word(words[i]) then
       words[i] = words[$]
       words = words[1..$-1]
   end if

end for populate_dict(words) --*/

function prrec(sequence wordstarts, integer idx, sequence sofar, bool show)

   if idx>length(wordstarts) then
       if show then
           ?sofar
       end if
       return 1
   end if
   integer res = 0
   for i=1 to length(wordstarts[idx]) do
       string w = wordstarts[idx][i]
       res += prrec(wordstarts,idx+length(w),append(sofar,w),show)
   end for
   return res

end function

function flattens(sequence s) -- remove all nesting and empty sequences from a nested sequence of strings sequence res = {}, si

   for i=1 to length(s) do
       si = s[i]
       if string(si) then
           res = append(res,si)
       else
           res &= flattens(si)
       end if
   end for
   return res

end function

procedure test(string s) integer l = length(s) sequence wordstarts = repeat({},l), wordends = repeat(0,l) integer wordend = 1 -- (pretend a word just ended at start)

   for i=1 to l do
       if wordend then
           for j=i to l do
               object pkey = getd_partial_key(s[i..j])
               if string(pkey) and length(pkey)>j-i and s[i..j]=pkey[1..j-i+1] then
                   if length(pkey)=j-i+1 then
                       -- exact match
                       wordstarts[i] = append(wordstarts[i],pkey)
                       wordends[j] += 1
                   end if
               else
                   exit
               end if  
           end for
       end if
       wordend = wordends[i]
   end for
   bool worthwhile = true
   while worthwhile do
       worthwhile = false
       wordend = 1 -- (pretend a word just ended at start)
       for i=1 to l do
           if wordend then
               -- eliminate any words that end before a wordstarts of {}.
               for j=length(wordstarts[i]) to 1 by -1 do
                   integer wl = length(wordstarts[i][j])
                   if i+wl<=l and wordstarts[i+wl]={} then
                       wordends[i+wl-1] -= 1
                       wordstarts[i][j..j] = {}
                       worthwhile = true
                   end if
               end for
           else
               -- elimitate all words that start here.
               for j=1 to length(wordstarts[i]) do
                   integer wl = length(wordstarts[i][j])
                   if i+wl<=l then
                       wordends[i+wl-1] -= 1
                       worthwhile = true
                   end if
               end for
               wordstarts[i] = {}
           end if
           wordend = wordends[i]
       end for
   end while

--?{wordstarts,wordends}

   if sum(wordends)=0 then
       printf(1,"%s: not possible",{s})
   else
       integer count = prrec(wordstarts,1,{},false)
       if count=1 then
           printf(1,"%s: 1 solution: %s\n",{s,join(flattens(wordstarts))})
       elsif count>20 then
           printf(1,"%s: %d solution(s): (too many to show)\n",{s,count})
           pp({wordstarts,wordends})
       else
           printf(1,"%s: %d solution(s):\n",{s,count})
           count = prrec(wordstarts,1,{},true)
       end if
   end if

end procedure

constant tests = {"abcd","abbc","abcbcd","acdbc","abcdd"} --constant tests = {"wordsisstringofspaceseparatedwords"} for i=1 to length(tests) do test(tests[i]) end for</lang>

Output:

{"a","bc","abc","cd","b"}
abcd: 1 solution: a b cd
abbc: 1 solution: a b bc
abcbcd: 2 solution(s):
{"a","bc","b","cd"}
{"abc","b","cd"}
acdbc: 1 solution: a cd bc
abcdd: not possible

PicoLisp

<lang PicoLisp>(setq *Dict (quote "a" "bc" "abc" "cd" "b")) (setq *Dict2

  (quote
     "mobile" "samsung" "sam" "sung" "man" "mango"
     "icecream" "and" "go" "i" "like" "ice" "cream" ) )

(de word (Str D)

  (let
     (Str (chop Str)
        Len (length Str)
        DP (need (inc Len))
        Res (need (inc Len))
        B 1 )
     (set DP 0)
     (map
        '((L)
           (and
              (get DP B)
              (for N (length L)
                 (let Str (pack (head N L))
                    (when (member Str D)
                       (set (nth Res (+ B N))
                          (copy (get Res B)) )
                       (queue (nth Res (+ B N)) Str)
                       (set (nth DP (+ B N))
                          (inc (get DP B)) ) ) ) ) )
           (inc 'B) )
        Str )
     (last Res) ) )

(println (word "abcd" *Dict)) (println (word "abbc" *Dict)) (println (word "abcbcd" *Dict)) (println (word "acdbc" *Dict)) (println (word "abcdd" *Dict)) (println (word "ilikesamsung" *Dict2)) (println (word "iii" *Dict2)) (println (word "ilikelikeimangoiii" *Dict2)) (println (word "samsungandmango" *Dict2)) (println (word "samsungandmangok" *Dict2)) (println (word "ksamsungandmango" *Dict2))</lang>

Output:

("a" "b" "cd")
("a" "b" "bc")
("a" "bc" "b" "cd")
("a" "cd" "bc")
NIL
("i" "like" "sam" "sung")
("i" "i" "i")
("i" "like" "like" "i" "man" "go" "i" "i" "i")
("sam" "sung" "and" "man" "go")
NIL
NIL

Racket

This returns all the possible splits (and null list if none is possible). Who's to say which is the best?

<lang racket>#lang racket

(define render-phrases pretty-print)

(define dict-1 (list "a" "bc" "abc" "cd" "b")) (define dict-2 (list "mobile" "samsung" "sam" "sung" "man" "mango"

                    "icecream" "and" "go" "i" "like" "ice" "cream"))

(define (word-splits str d)

 (let ((memo (make-hash)))
   (let inr ((s str))
     (hash-ref! memo s
                (λ () (append* (filter-map (λ (w)
                                             (and (string-prefix? s w)
                                                  (if (string=? w s)
                                                      (list s)
                                                      (map (λ (tl) (string-append w " " tl))
                                                           (inr (substring s (string-length w)))))))
                                           d)))))))

(module+ main

 (render-phrases (word-splits "abcd" dict-1))
 (render-phrases (word-splits "abbc" dict-1))
 (render-phrases (word-splits "abcbcd" dict-1))
 (render-phrases (word-splits "acdbc" dict-1))
 (render-phrases (word-splits "abcdd" dict-1))
 (render-phrases (word-splits "ilikesamsung" dict-2))
 (render-phrases (word-splits "iii" dict-2))
 (render-phrases (word-splits "ilikelikeimangoiii" dict-2))
 (render-phrases (word-splits "samsungandmango" dict-2))
 (render-phrases (word-splits "samsungandmangok" dict-2))
 (render-phrases (word-splits "ksamsungandmango" dict-2)))</lang>

Output:

'("a b cd")
'("a b bc")
'("a bc b cd" "abc b cd")
'("a cd bc")
'()
'("i like samsung" "i like sam sung")
'("i i i")
'("i like like i man go i i i" "i like like i mango i i i")
'("samsung and man go"
  "samsung and mango"
  "sam sung and man go"
  "sam sung and mango")
'()
'()

REXX

This REXX version allows the words to be tested (and the dictionary words) to be specified on the command line. <lang rexx>/*REXX program breaks up a word (or string) into a list of words from a dictionary. */ parse arg a '/' x; a=space(a); x=space(x) /*get optional args; elide extra blanks*/ if a== | a=="," then a= 'abcd abbc abcbcd acdbc abcdd' /*maybe use the defaults ? */ if x== | x=="," then x= 'a bc abc cd b' /* " " " " */ na=words(a) /*the number of words to be tested. */ nx=words(x) /* " " " " " the dictionary*/ say nx ' dictionary words: ' x /*display the words in the dictionary. */ say /*display a blank line to the terminal.*/ aw=0; do i=1 for na; _=word(a, i) /*obtain a word that will be tested. */

      aw=max(aw, length(_) )                    /*find widest width word being tested. */
      end   /*i*/                               /* [↑]  AW  is used to align the output*/

@.=0 /*initialize the dictionary to "null". */ xw=0; do i=1 for nx; _=word(x, i) /*obtain a word from the dictionary. */

      xw=max(xw, length(_) );  @._=1            /*find widest width dictionary word.   */
      end   /*i*/                               /* [↑]  define a dictionary word.      */

p=0 /* [↓] process a word in the A list.*/

      do j=1  for na;          yy=word(a, j)    /*YY:   test a word  from the  A  list.*/
        do t=(nx+1)**(xw+1)  by -1  to 1  until y==;  y=yy    /*try word possibility.*/
        $=                                      /*nullify the (possible) result list.  */
            do try=t  while y\=               /*keep testing until  Y  is exhausted. */
            p=(try + p)  // xw    + 1           /*use a possible width for this attempt*/
            p=fw(y, p); if p==0  then iterate t /*is this part of the word not found ? */
            $=$ ?                               /*It was found. Add partial to the list*/
            y=substr(y,  p + 1)                 /*now, use and test the rest of word.  */
            end   /*try*/
        end       /*t*/

      if t==0  then $= '(not possible)'         /*indicate that the word isn't possible*/
      say right(yy, aw)    '───►'    strip($)   /*display the result to the terminal.  */
      end   /*j*/

exit /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ fw: parse arg z,L; do k=L by -1 for L; ?=left(z,k); if @.? then leave; end; return k</lang>

output when using the default inputs:

5  dictionary words:  a bc abc cd b

  abcd ───► a b cd
  abbc ───► a b bc
abcbcd ───► abc b cd
 acdbc ───► a cd bc
 abcdd ───► (not possible)

Ring

Project : Word break problem
Date : 2018/02/09
Author : Gal Zsolt [~ CalmoSoft ~]
Email : <calmosoft@gmail.com>

load "stdlib.ring" list = ["a", "bc", "abc", "cd", "b"] inslist = list for n = 1 to len(inslist) - 1

     for m = len(inslist) to 1 step -1
           insert(list,0,inslist[m])
     next

next strings = ["abcd", "abbc", "abcbcd", "acdbc", "abcdd"] ind = len(list) items = newlist(pow(2,len(list))-1,ind) powerset(list,ind)

for p = 1 to len(strings)

     showarray(items,strings[p])

       num = 0
       num2 = 0
       items = newlist(pow(2,len(list))-1,2*ind)
       for i = 2 to (2 << len(list)) - 1 step 2
            num2 = 0
            num = num + 1
            for j = 1 to len(list) 
                 if i & (1 << j)
                     num2 = num2 + 1
                     if list[j] != 0
                       items[num][num2] = list[j]
                    ok
                 ok
            next
       next
       return items

func showarray(items,par)

       ready = []
       for n = 1 to len(items)
             for m = n + 1 to len(items) - 1
                   flag = 0
                   str = ""
                   for x = 1 to len(items[n])
                        if items[n][x] != 0  
                           str = str + items[n][x] + " "
                        ok
                   next 
                   str = left(str, len(str) - 1)
                   strsave = str
                   str = substr(str, " ", "") 
                   if str = par
                      pos = find(ready,strsave)               
                      if pos = 0              
                         add(ready,strsave)
                         flag = 1 
                         see par + " = " + strsave + nl
                     ok
                     if flag != 1 
                        del(items,m)
                     ok
                  ok
             next
       next

</lang> Output:

abcd = a b cd
abbc = a b bc
abcbcd = abc b cd
acdbc = a cd bc

Rust

Dynamic programming

<lang rust>use std::collections::HashSet; fn create_string(s: &str, v: Vec<Option<usize>>) -> String {

   let mut idx = s.len();
   let mut slice_vec = vec![];
   while let Some(prev) = v[idx] {
       slice_vec.push(&s[prev..idx]); 
       idx = prev;
   }
   slice_vec.reverse();
   slice_vec.join(" ")

}

fn word_break(s: &str, dict: HashSet<&str>) -> Option<String> {

   let size = s.len() + 1;
   let mut possible = vec![None; size];

   let check_word = |i,j| dict.get(&s[i..j]).map(|_| i);

   for i in 1..size {
       possible[i] = possible[i].or_else(|| check_word(0,i));

       if possible[i].is_some() {
           for j in i+1..size {
               possible[j] = possible[j].or_else(|| check_word(i,j));
           }

           if possible[s.len()].is_some() {
               return Some(create_string(s, possible));
           }

       };
   }
   None

}

fn main() {

   let mut set = HashSet::new();
   set.insert("a");
   set.insert("bc");
   set.insert("abc");
   set.insert("cd");
   set.insert("b");
   println!("{:?}", word_break("abcd", set).unwrap());

}</lang>

Output:

"a b cd"

zkl

<lang zkl>fcn wordBreak(str,words){ // words is string of space seperated words

  words=words.split(" ");	// to list of words
  r:=fcn(str,words,sink){	// recursive search, easy to collect answer
     foreach word in (words){

if(not str) return(True); // consumed string ie matched everything if(str.find(word)==0){ // word starts str, 0 so answer is ordered z:=word.len(); if(self.fcn(str.del(0,z),words,sink)) return(sink.write(word)); }

     }
     False		// can't make forward progress, back out & retry
  }(str,words,List());		// run the lambda
  if(False==r) return("not possible");
  r.reverse().concat(" ")

}</lang> <lang zkl>foreach text in (T("abcd","abbc","abcbcd","acdbc","abcdd")){

  println(text,": ",wordBreak(text,"a bc abc cd b"))

}</lang>

Output:

abcd: a b cd
abbc: a b bc
abcbcd: a bc b cd
acdbc: a cd bc
abcdd: not possible