Word break problem: Difference between revisions

Content added Content deleted

Inline

Revision as of 19:08, 13 July 2017

Task: Given an input string and a dictionary of words, segment the input string into a space-separated sequence of dictionary words if possible.

Aime

<lang aime>integer wordbreak(record dict, text phrase, integer p, list words) {

   integer complete, n;
   text s;

   complete = 0;
   if (rsk_lower(dict, phrase, s)) {
       if (s == phrase) {
           l_append(words, s);
           complete = 1;
       } else {
           do {
               n = 0;
               while (phrase[n] == s[n]) {
                   n += 1;
               }
               if (!n) {
                   break;
               }
               l_append(words, cut(s, 0, n));
               complete = wordbreak(dict, project(phrase, n), p + 1, words);
               if (complete) {
                   break;
               }
               l_delete(words, -1);
           } while (rsk_less(dict, s, s));
       }
   }

   if (!p) {
       o_(phrase, ":");
       if (complete) {
           l_ucall(words, o_, 1, " ");
       } else {
           o_(" can't break");
       }
       o_newline();

       l_clear(words);
   }

   return complete;

}

integer main(void) {

   record dict;

   r_fit(dict, "a", 0, "bc", 0, "abc", 0, "cd", 0, "b", 0);
   l_ucall(l_effect("abcd", "abbc", "abcbcd", "acdbc", "abcdd"), wordbreak, 1, dict, 0, l_effect());

   return 0;

}</lang>

Output:

abcd: a b cd
abbc: a b bc
abcbcd: abc b cd
acdbc: a cd bc
abcdd: can't break

Perl 6

Works with: Rakudo version 2017.04

This implementation does not necessarily find every combination, it returns the one with the longest matching tokens.

<lang perl6>my @words = <a bc abc cd b>; my $regex = @words.join('|');

put "$_: ", word-break($_) for <abcd abbc abcbcd acdbc abcdd>;

sub word-break (Str $word) { ($word ~~ / ^ (<$regex>)+ $ /)[0] // "Not possible" }</lang>

Output:

abcd: a b cd
abbc: a b bc
abcbcd: abc b cd
acdbc: a cd bc
abcdd: Not possible

Phix

See talk page <lang Phix>procedure populate_dict(sequence s) ?s

   for i=1 to length(s) do setd(s[i],0) end for

end procedure

--/* function valid_word(string word)

   if length(word)=1 then return find(word,{"a","i"})!=0 end if
   if find(word,{"sis","sst","se"}) then return false end if  -- hack
   for i=1 to length(word) do
       integer ch = word[i]
       if ch<'a'
       or ch>'z' then
           return false
       end if
   end for
   return true

end function --*/

populate_dict(split("a bc abc cd b")) --/* integer fn = open("demo\\unixdict.txt","r") sequence words = get_text(fn,GT_LF_STRIPPED) close(fn) for i=length(words) to 1 by -1 do

   if not valid_word(words[i]) then
       words[i] = words[$]
       words = words[1..$-1]
   end if

end for populate_dict(words) --*/

function prrec(sequence wordstarts, integer idx, sequence sofar, bool show)

   if idx>length(wordstarts) then
       if show then
           ?sofar
       end if
       return 1
   end if
   integer res = 0
   for i=1 to length(wordstarts[idx]) do
       string w = wordstarts[idx][i]
       res += prrec(wordstarts,idx+length(w),append(sofar,w),show)
   end for
   return res

end function

function flattens(sequence s) -- remove all nesting and empty sequences from a nested sequence of strings sequence res = {}, si

   for i=1 to length(s) do
       si = s[i]
       if string(si) then
           res = append(res,si)
       else
           res &= flattens(si)
       end if
   end for
   return res

end function

procedure test(string s) integer l = length(s) sequence wordstarts = repeat({},l), wordends = repeat(0,l) integer wordend = 1 -- (pretend a word just ended at start)

   for i=1 to l do
       if wordend then
           for j=i to l do
               object pkey = getd_partial_key(s[i..j])
               if string(pkey) and length(pkey)>j-i and s[i..j]=pkey[1..j-i+1] then
                   if length(pkey)=j-i+1 then
                       -- exact match
                       wordstarts[i] = append(wordstarts[i],pkey)
                       wordends[j] += 1
                   end if
               else
                   exit
               end if  
           end for
       end if
       wordend = wordends[i]
   end for
   bool worthwhile = true
   while worthwhile do
       worthwhile = false
       wordend = 1 -- (pretend a word just ended at start)
       for i=1 to l do
           if wordend then
               -- eliminate any words that end before a wordstarts of {}.
               for j=length(wordstarts[i]) to 1 by -1 do
                   integer wl = length(wordstarts[i][j])
                   if i+wl<=l and wordstarts[i+wl]={} then
                       wordends[i+wl-1] -= 1
                       wordstarts[i][j..j] = {}
                       worthwhile = true
                   end if
               end for
           else
               -- elimitate all words that start here.
               for j=1 to length(wordstarts[i]) do
                   integer wl = length(wordstarts[i][j])
                   if i+wl<=l then
                       wordends[i+wl-1] -= 1
                       worthwhile = true
                   end if
               end for
               wordstarts[i] = {}
           end if
           wordend = wordends[i]
       end for
   end while

--?{wordstarts,wordends}

   if sum(wordends)=0 then
       printf(1,"%s: not possible",{s})
   else
       integer count = prrec(wordstarts,1,{},false)
       if count=1 then
           printf(1,"%s: 1 solution: %s\n",{s,join(flattens(wordstarts))})
       elsif count>20 then
           printf(1,"%s: %d solution(s): (too many to show)\n",{s,count})
           pp({wordstarts,wordends})
       else
           printf(1,"%s: %d solution(s):\n",{s,count})
           count = prrec(wordstarts,1,{},true)
       end if
   end if

end procedure

constant tests = {"abcd","abbc","abcbcd","acdbc","abcdd"} --constant tests = {"wordsisstringofspaceseparatedwords"} for i=1 to length(tests) do test(tests[i]) end for</lang>

Output:

{"a","bc","abc","cd","b"}
abcd: 1 solution: a b cd
abbc: 1 solution: a b bc
abcbcd: 2 solution(s):
{"a","bc","b","cd"}
{"abc","b","cd"}
acdbc: 1 solution: a cd bc
abcdd: not possible

This REXX version allows the words to be tested (and the dictionary words) to be specified on the command line. <lang rexx>/*REXX program breaks up a word (or string) into a list of words from a dictionary. */ parse arg a '/' x; $=space($); x=space(x) /*get optional args; elide extra blanks*/ if a== | a=="," then a='abcd abbc abcbcd acdbc abcdd' /*maybe use the defaults ? */ if x== | x=="," then x='a bc abc cd b' /* " " " " */ na=words(a) /*the number of words to be tested. */ nx=words(x) /* " " " " " the dictionary*/ say nx 'dictionary words:' x /*display the words in the dictionary. */ say /*display a blank line to the terminal.*/ aw=0; do i=1 for na; _=word(a,i) /*obtain a word that will be tested. */

      aw=max(aw,length(_))                      /*find widest width word being tested. */
      end   /*i*/                               /* [↑]  AW  is used to align the output*/

@.=0 /*initialize the dictionary to "null". */ xw=0; do i=1 for nx; _=word(x,i) /*obtain a word from the dictionary. */

      xw=max(xw,length(_));    @._=1            /*find widest width dictionary word.   */
      end   /*i*/                               /* [↑]  define a dictionary word.      */

p=0 /* [↓] process a word in the A list.*/

      do j=1  for na;          yy=word(a,j)     /*YY:   test a word  from the  A  list.*/
        do t=(nx+1)**(xw+1) by -1 to 1  until y==;  y=yy    /*try a word possibility.*/
        $=                                      /*nullify the (possible) result list.  */
            do try=t  while y\=               /*keep testing until  Y  is exhausted. */
            p=(try+p)//xw+1                     /*use a possible width for this attempt*/
            p=fw(y,p);  if p==0  then iterate t /*is this part of the word not found ? */
            $=$ ?                               /*It was found. Add partial to the list*/
            y=substr(y,p+1)                     /*now, use and test the rest of word.  */
            end   /*try*/
        end       /*t*/

      if t==0  then $= '(not possible)'         /*indicate that the word isn't possible*/
      say right(yy,aw)  '───►'  strip($)        /*display the result to the terminal.  */
      end   /*j*/

exit /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ fw: parse arg z,L; do k=L by -1 for L; ?=left(z,k); if @.? then leave; end; return k</lang>

output when using the default inputs:

5 dictionary words: a bc abc cd b

  abcd ───► a b cd
  abbc ───► a b bc
abcbcd ───► abc b cd
 acdbc ───► a cd bc
 abcdd ───► (not possible)

Rust

Dynamic programming

<lang rust>use std::collections::HashSet; fn create_string(s: &str, v: Vec<Option<usize>>) -> String {

   let mut idx = s.len();
   let mut slice_vec = vec![];
   while let Some(prev) = v[idx] {
       slice_vec.push(&s[prev..idx]); 
       idx = prev;
   }
   slice_vec.reverse();
   slice_vec.join(" ")

}

fn word_break(s: &str, dict: HashSet<&str>) -> Option<String> {

   let size = s.len() + 1;
   let mut possible = vec![None; size];

   let check_word = |i,j| dict.get(&s[i..j]).map(|_| i);

   for i in 1..size {
       possible[i] = possible[i].or_else(|| check_word(0,i));

       if possible[i].is_some() {
           for j in i+1..size {
               possible[j] = possible[j].or_else(|| check_word(i,j));
           }

           if possible[s.len()].is_some() {
               return Some(create_string(s, possible));
           }

       };
   }
   None

}

fn main() {

   let mut set = HashSet::new();
   set.insert("a");
   set.insert("bc");
   set.insert("abc");
   set.insert("cd");
   set.insert("b");
   println!("{:?}", word_break("abcd", set).unwrap());

}</lang>

Output:

"a b cd"

zkl

<lang zkl>fcn wordBreak(str,words){ // words is string of space seperated words

  words=words.split(" ");	// to list of words
  r:=fcn(str,words,sink){	// recursive search, easy to collect answer
     foreach word in (words){

if(not str) return(True); // consumed string ie matched everything if(str.find(word)==0){ // word starts str, 0 so answer is ordered z:=word.len(); if(self.fcn(str.del(0,z),words,sink)) return(sink.write(word)); }

     }
     False		// can't make forward progress, back out & retry
  }(str,words,List());		// run the lambda
  if(False==r) return("not possible");
  r.reverse().concat(" ")

}</lang> <lang zkl>foreach text in (T("abcd","abbc","abcbcd","acdbc","abcdd")){

  println(text,": ",wordBreak(text,"a bc abc cd b"))

}</lang>

Output:

abcd: a b cd
abbc: a b bc
abcbcd: a bc b cd
acdbc: a cd bc
abcdd: not possible