Longest substrings without repeating characters

From Rosetta Code
Revision as of 19:53, 22 June 2021 by rosettacode>Gerard Schildberger (→‎{{header|REXX}}: matched an invocation to the procedure name.)
Longest substrings without repeating characters is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.
Task

Given a string,   find the   longest substring   without any repeating characters.


Other tasks related to string operations:
Metrics
Counting
Remove/replace
Anagrams/Derangements/shuffling
Find/Search/Determine
Formatting
Song lyrics/poems/Mad Libs/phrases
Tokenize
Sequences




C++

<lang cpp>#include <iostream>

  1. include <fstream>
  2. include <set>
  3. include <sstream>
  4. include <string>
  5. include <vector>

std::vector<std::string> longest_substrings_without_repeats(const std::string& str) {

   size_t max_length = 0;
   std::vector<std::string> result;
   size_t length = str.size();
   for (size_t offset = 0; offset < length; ++offset) {
       std::set<char> characters;
       size_t len = 0;
       for (; offset + len < length; ++len) {
           if (characters.find(str[offset + len]) != characters.end())
               break;
           characters.insert(str[offset + len]);
       }
       if (len > max_length) {
           result.clear();
           max_length = len;
       }
       if (len == max_length)
           result.push_back(str.substr(offset, max_length));
   }
   return result;

}

void print_strings(const std::vector<std::string>& strings) {

   std::cout << "[";
   for (size_t i = 0, n = strings.size(); i < n; ++i) {
       if (i > 0)
           std::cout << ", ";
       std::cout << '\ << strings[i] << '\;
   }
   std::cout << "]";

}

void test1() {

   for (std::string str : { "xyzyabcybdfd", "xyzyab", "zzzzz", "a", "thisisastringtest", "" }) {
       std::cout << "Input: '" << str << "'\nResult: ";
       print_strings(longest_substrings_without_repeats(str));
       std::cout << "\n\n";
   }

}

std::string slurp(std::istream& in) {

   std::ostringstream out;
   out << in.rdbuf();
   return out.str();

}

void test2(const std::string& filename) {

   std::ifstream in(filename);
   if (!in) {
       std::cerr << "Cannot open file '" << filename << "'.\n";
       return;
   }
   std::cout << "Longest substring with no repeats found in '" << filename << "': ";
   print_strings(longest_substrings_without_repeats(slurp(in)));
   std::cout << "\n";

}

int main() {

   test1();
   test2("unixdict.txt");

}</lang>

Output:
Input: 'xyzyabcybdfd'
Result: ['zyabc', 'cybdf']

Input: 'xyzyab'
Result: ['zyab']

Input: 'zzzzz'
Result: ['z', 'z', 'z', 'z', 'z']

Input: 'a'
Result: ['a']

Input: 'thisisastringtest'
Result: ['astring', 'ringtes']

Input: ''
Result: []

Longest substring with no repeats found in 'unixdict.txt': ['mbowel
disgrunt']

Julia

Works on any array, treating a character string as a character array. <lang julia>function alluniquehead(arr)

   len = length(arr)
   if len > 1
       for i in 2:len
           if arr[i] in @view arr[1:i-1]
               return arr[1:i-1]
           end
       end
   end
   return arr[:]

end

function maxuniques(arr)

   length(arr) < 2 && return arr
   amax = [alluniquehead(@view arr[i:end]) for i in 1:length(arr)]
   maxlen = maximum(map(length, amax))
   return filter(a -> length(a) == maxlen, amax)

end

for s in ["xyzyabcybdfd", "xyzyab", "zzzzz", "a", "α⊆϶α϶", "", [1, 2, 3, 4, 1, 2, 5, 6, 1, 7, 8, 1, 0]]

   uarray = maxuniques(collect(s))
   !isempty(uarray) && length(uarray[1]) > 1 && uarray[1][1] isa Char && (uarray = String.(uarray))
   println("\"$s\" => ", uarray)

end

</lang>
Output:
"xyzyabcybdfd" => ["zyabc", "cybdf"]
"xyzyab" => ["zyab"]
"zzzzz" => [['z'], ['z'], ['z'], ['z'], ['z']]
"a" => ['a']
"α⊆϶α϶" => ["α⊆϶", "⊆϶α"]
"" => Char[]
"[1, 2, 3, 4, 1, 2, 5, 6, 1, 7, 8, 1, 0]" => [[3, 4, 1, 2, 5, 6], [2, 5, 6, 1, 7, 8]]

Nim

This version converts strings to sequences of runes. <lang Nim>import sequtils, strutils, unicode

type Runes = seq[Rune]

func longestSubstrings(str: string): seq[string] =

 var runes = str.toRunes
 var current: Runes
 var res: seq[Runes]   # Result as a list of Runes.
 var maxlen = 0
 for c in runes:
   let idx = current.find(c)
   if idx >= 0:
     if current.len > maxlen:
       res = @[current]
       maxlen = current.len
     elif current.len == maxlen and current notin res:
       res.add current
     current.delete(0, idx)
   current.add c
 # Process the last current string.
 if current.len > maxlen: res = @[current]
 elif current.len == maxlen and current notin res: res.add current
 result = res.mapIt($it)


for str in ["xyzyabcybdfd", "xyzyab", "zzzzz", "a", "α⊆϶α϶", ""]:

 echo '"', str, '"', " → ", longestSubstrings(str)

echo "\nLongest substrings in concatenated list of words from “unixdict.txt”: ",

    longestSubstrings(toSeq("unixdict.txt".lines).join())</lang>
Output:
"xyzyabcybdfd" → @["zyabc", "cybdf"]
"xyzyab" → @["zyab"]
"zzzzz" → @["z"]
"a" → @["a"]
"α⊆϶α϶" → @["α⊆϶", "⊆϶α"]
"" → @[""]

Longest substrings in concatenated list of words from “unixdict.txt”: @["mboweldisgrunt"]

Perl

Gets the same answer that raku does when run against unixdict.txt <lang perl>#!/usr/bin/perl

use strict; # Longest_substrings_without_repeating_characters use warnings;

for my $string ( qw( xyzyabcybdfd xyzyab zzzzz a thisisastringtest ) )

 {
 local $_ = $string;
 my @sub;
 length $+ >= $#sub and ++$sub[length $+]{$+} while s/.*(.)(.*\K\1.*)|(.+)//s;
 printf "%20s -> %s\n", $string, join ' ', sort keys %{ pop @sub };
 }</lang>
Output:
        xyzyabcybdfd -> cybdf zyabc
              xyzyab -> zyab
               zzzzz -> z
                   a -> a
   thisisastringtest -> astring ringtes

Phix

Single pass, maintains a table of seen characters and a start position.
If the current character occurs in start..i we trim left until cleared, otherwise/then we add any longer/equal len start..i to the result set.
Should be exponentially faster than collecting all possible substrings and then eliminating those with duplicate characters or too short.
It will however collect duplicates (when long enough) before weeding them out at the end.

with javascript_semantics

function longest_substrings(sequence s)
    --
    -- s can be a normal 8-bit ansi string or a
    -- 32-bit unicode sequence from eg utf8_to_utf32().
    -- it will not complain if given utf-8, but it may
    -- chop up multi-byte characters inappropriately.
    -- s must not contain any 0 or non-integer elements.
    --
    sequence res = {},
             found = repeat(false,256) -- (for ansi)
    integer start = 1,
            maxlen = 0
    for i=1 to length(s) do
        integer si = s[i]   -- (deliberate typecheck)
        if si>length(found) then    -- (for unicode)
            found &= repeat(false,si-length(found))
        elsif found[si] then
            while true do
                integer ss = s[start]
                start += 1
                found[ss] = false
                if ss = si then exit end if
            end while
            -- aside: we will not have found anything
            -- longer, but may have trimmed 1 added 1
            -- and therefore still be === maxlen.
        end if
        found[si] = true
        integer newlen = i-start+1
        if newlen>maxlen then
            res = {}
            maxlen = newlen
        end if
        if newlen=maxlen then
            res = append(res,s[start..i])
        end if
    end for
    res = unique(res,"STABLE")
    return res
end function

?longest_substrings("xyzyabcybdfd")
?longest_substrings("xyzyab")
?longest_substrings("zzzzz")
?longest_substrings("a")
?longest_substrings("")
Output:
{"zyabc","cybdf"}
{"zyab"}
{"z"}
{"a"}
{}

A quick test on unixdict.txt yielded the same results as Raku.

Python

The following algorithm works but is not terribly efficient for long strings. <lang python>def longest_substring(s = "xyzyab"):

   substr = [s[x:y] for x in range(len(s)) for y in range(x+1, len(s) + 1)]
   no_reps = []
   for sub in substr:
       if len(sub) == len(set(sub)) and sub not in no_reps:
           no_reps.append(sub)
   max_len = max(len(sub) for sub in no_reps) if no_reps else 0
   max_subs = [sub for sub in no_reps if len(sub) == max_len]
   return max_subs

if __name__ == '__main__':

   for s in ["xyzyabcybdfd", "xyzyab", "zzzzz", "a", "α⊆϶α϶", "",
             [1, 2, 3, 4, 1, 2, 5, 6, 1, 7, 8, 1, 0]]:
       print(f"{s} => {longest_substring(s)}")</lang>
Output:
xyzyabcybdfd => ['zyabc', 'cybdf']
xyzyab => ['zyab']
zzzzz => ['z']
a => ['a']
α⊆϶α϶ => ['α⊆϶', '⊆϶α']
 => []
[1, 2, 3, 4, 1, 2, 5, 6, 1, 7, 8, 1, 0] => [[3, 4, 1, 2, 5, 6], [2, 5, 6, 1, 7, 8]]

Python: Some optimisation

The following algorithm only accrues the longest so far. <lang python>def longest_substring2(s = "xyzyab"):

   max_subs, mx = [], 0
   for x in range(len(s)):
       for y in range(x+1, len(s) + 1):
           sub = s[x:y]
           if y - x >= mx and len(sub) == len(set(sub)):
               if y - x == mx and sub not in max_subs:
                   max_subs.append(sub)
               else:
                   max_subs, mx = [sub], y - x
   return max_subs</lang>
Output:

It gives the same output as function longest_substring() above.

Raku

Detects any character when checking for repeated characters, even multi-byte composite characters, control sequences and whitespace.

Note that there is no requirement that the substrings be unique, only that each has no repeated characters internally.

Not going to bother handling arrays since an array is not a string, and the task description specifically says 'Given a string'. <lang perl6>sub abbreviate ($_) { .chars > 80 ?? "(abbreviated)\n" ~ .substr(0,35) ~ ' ... ' ~ .substr(*-35) !! $_ }

sub longest ($string) {

  return 0 => [] unless $string.chars;
  my ($start, $end, @substr) = 0, 0;
  while ++$end < $string.chars {
      my $sub = $string.substr($start, $end - $start);
      if $sub.contains: my $next = $string.substr: $end, 1 {
          @substr[$end - $start].push($sub) if $end - $start >= @substr.end;
          $start += 1 + $sub.index($next);
      }
  }
  @substr[$end - $start].push: $string.substr($start);
  @substr.pairs.tail

}

  1. Testing

say "\nOriginal string: {abbreviate $_}\nLongest substring(s) chars: ", .&longest

  1. Standard tests

for flat qww< xyzyabcybdfd xyzyab zzzzz a >,

  1. multibyte Unicode

< 👒🎩🎓👩‍👩‍👦‍👦🧢🎓👨‍👧‍👧👒👩‍👩‍👦‍👦🎩🎓👒🧢 α⊆϶α϶ >,

  1. check a file

slurp 'unixdict.txt';</lang>

Output:
Original string: xyzyabcybdfd
Longest substring(s) chars: 5 => [zyabc cybdf]

Original string: xyzyab
Longest substring(s) chars: 4 => [zyab]

Original string: zzzzz
Longest substring(s) chars: 1 => [z z z z z]

Original string: a
Longest substring(s) chars: 1 => [a]

Original string: 
Longest substring(s) chars: 0 => []

Original string: 👒🎩🎓👩‍👩‍👦‍👦🧢🎓👨‍👧‍👧👒👩‍👩‍👦‍👦🎩🎓👒🧢
Longest substring(s) chars: 6 => [🧢🎓👨‍👧‍👧👒👩‍👩‍👦‍👦🎩]

Original string: α⊆϶α϶
Longest substring(s) chars: 3 => [α⊆϶ ⊆϶α]

Original string: (abbreviated)
10th
1st
2nd
3rd
4th
5th
6th
7th
8t ... rian
zounds
zucchini
zurich
zygote

Longest substring(s) chars: 15 => [mbowel
disgrunt]

REXX

<lang rexx>/*REXX pgm finds the longest substring (in a given string) without a repeating character*/ parse arg $ /*obtain optional argument from the CL.*/ if $== | $=="," then $= 'xyzyabcybdfd' /*Not specified? Then use the default.*/ L= length($) /*L: the length of the given string.*/ maxL= 0 /*MAXL: max length substring (so far).*/ @.= /*array to hold the max len substrings.*/

    do   j=1  for L;    b= substr($, j, 1)      /*search for max length substrings.    */
    x=                                          /*X:   the substring, less the 1st char*/
       do k=j+1  to L;  x= x || substr($, k, 1) /*search for the max length substrings.*/
       if \OKx(x)  then iterate j               /*Are there an replications?  Skip it. */
       _= length(x);  if _<maxL  then iterate   /*is length less then the current max? */
       @._= @._  x;   maxL= _                   /*add this substring to the max list.  */
       end   /*k*/
    end      /*j*/

say 'longest substring's(words(@.maxL)) "in " $ ' ───► ' @.maxL exit 0 /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ s: if arg(1)==1 then return arg(3); return word(arg(2) 's', 1) /*simple pluralizer*/ /*──────────────────────────────────────────────────────────────────────────────────────*/ OKx: procedure; parse arg y; do r=1 for length(y)-1 /*look for duplicate chars.*/

                                  if pos(substr(y, r, 1), y, r+1)>0  then return 0
                                  end   /*r*/;                            return 1</lang>
output   when using the default input:
longest substrings in  xyzyabcybdfd  ───►   zyabc cybdf

Ring

<lang ring> see "working..." + nl see "Longest substrings without repeating characters are:" + nl str = "xyzyabcybdfd" see "Input: " + str + nl lenStr = len(str) strOk = [] lenOk = 0

for n = 1 to lenStr

   for m = 1 to lenStr-n+1
       temp = substr(str,n,m)
       add(strOk,temp)
   next

next

for n = len(strOk) to 1 step -1

   if len(strOK[n]) = 1
      del(strOK,n)
   ok

next

for n = 1 to len(strOK)

   for m = 1 to len(strOK)-1
       if len(strOK[m+1]) > len(strOK[m])
          temp = strOK[m]
          strOK[m] = strOK[m+1]
          strOK[m+1] = temp
       ok
   next

next

for n = 1 to len(strOK)

   flag = 1
   for m = 1 to len(strOK[n])
       for p = m+1 to len(strOK[n])
           if strOK[n][m] = strOK[n][p]
              flag = 0
              exit
           ok
       next
   next
   if flag = 1
      if len(strOK[n]) >= lenOk
         see "len = " + len(strOK[n]) + " : " + strOK[n] + nl
         lenOK = len(strOK[n])
      ok
   ok

next

see "done..." + nl </lang>

Output:
working...
Longest substrings without repeating characters are:
Input: xyzyabcybdfd
len = 5 : zyabc
len = 5 : cybdf
done...

Wren

Library: Wren-seq

<lang ecmascript>import "/seq" for Lst

var substrings = Fn.new { |s|

   var n = s.count
   if (n == 0) return [""]
   var ss = []
   for (i in 0...n) {
       for (len in 1..n-i) ss.add(s[i...i+len])
   }
   return ss

}

System.print("The longest substring(s) of the following without repeating characters are:\n") var strs = ["xyzyabcybdfd", "xyzyab", "zzzzz", "a", "", ] for (s in strs) {

   var longest = []
   var longestLen = 0
   for (ss in substrings.call(s)) {
       if (Lst.distinct(ss.toList).count == ss.count) { 
           if (ss.count >= longestLen) {
               if (ss.count > longestLen) {
                   longest.clear()
                   longestLen = ss.count
               }
               longest.add(ss.join())
           }
       }
   }
   longest = Lst.distinct(longest)
   System.print("String = '%(s)'")
   System.print(longest)
   System.print("Length = %(longest[0].count)\n")

}</lang>

Output:
The longest substring(s) of the following without repeating characters are:

String = 'xyzyabcybdfd'
[zyabc, cybdf]
Length = 5

String = 'xyzyab'
[zyab]
Length = 4

String = 'zzzzz'
[z]
Length = 1

String = 'a'
[a]
Length = 1

String = ''
[]
Length = 0