Longest Common Substring

From Rosetta Code
Longest Common Substring is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.
Write a function returns the longest common substring of two strings. Use it within a program that demonstrates sample output from the function, which will consist of the longest common substring between "thisisatest" and "testing123testing". Note that substrings are consecutive characters within a string. This distinguishes them from subsequences, which is any sequence of characters within a string, even if there are extraneous characters in between them. Hence, the longest common subsequence between "thisisatest" and "testing123testing" is "tsitest", whereas the longest common substring is just "test".

References:

Metrics: length

Sub-string search: Count occurrences of a substring

Multi-string operations: LCP, LCS, concatenation

Manipulation: reverse, lower- and uppercase

Aime[edit]

void
test_string(text &g, text v, text l)
{
integer n;
 
n = 0;
while (l[n] && v[n] == l[n]) {
n += 1;
}
if (length(g) < n) {
g = cut(l, 0, n);
}
}
 
text
longest(text u, text v)
{
record r;
text g, l, s;
 
while (length(u)) {
r[u] = 0;
u = delete(u, 0);
}
while (length(v)) {
if (rsk_lower(r, v, l)) {
test_string(g, v, l);
}
if (rsk_upper(r, v, l)) {
test_string(g, v, l);
}
v = delete(v, 0);
}
 
return g;
}
o_(longest("thisisatest", "testing123testing"), "\n");

AutoHotkey[edit]

Using Text Comparison[edit]

LCS(a, b){
x := i := 1
while StrLen(x)
Loop % StrLen(a)
IfInString, b, % x := SubStr(a, i:=StrLen(x)=1 ? i+1 : i, n:=StrLen(a)+1-A_Index)
res := StrLen(res) > StrLen(x) ? res : x
return res
}
Examples:
MsgBox % LCS("thisisatest", "testing123testing")
Outputs:
test

Using RegEx[edit]

LCS(a, b){
while pos := RegExMatch(a "`n" b, "(.+)(?=.*\R.*\1)", m, pos?pos+StrLen(m):1)
res := StrLen(res) > StrLen(m1) ? res : m1
return res
}
Examples:
MsgBox % LCS("thisisatest", "testing123testing")
Outputs:
test


C++[edit]

#include <string>
#include <algorithm>
#include <iostream>
#include <set>
#include <vector>
 
void findSubstrings ( const std::string & word , std::set<std::string> & substrings ) {
int l = word.length( ) ;
for ( int start = 0 ; start < l ; start++ ) {
for ( int length = 1 ; length < l - start + 1 ; length++ ) {
substrings.insert ( word.substr( start , length ) ) ;
}
}
}
 
std::string lcs ( const std::string & first , const std::string & second ) {
std::set<std::string> firstSubstrings , secondSubstrings ;
findSubstrings ( first , firstSubstrings ) ;
findSubstrings ( second , secondSubstrings ) ;
std::set<std::string> common ;
std::set_intersection ( firstSubstrings.begin( ) , firstSubstrings.end( ) ,
secondSubstrings.begin( ) , secondSubstrings.end( ) ,
std::inserter ( common , common.begin( ) ) ) ;
std::vector<std::string> commonSubs ( common.begin( ) , common.end( ) ) ;
std::sort ( commonSubs.begin( ) , commonSubs.end( ) , []( const std::string &s1 ,
const std::string &s2 ) { return s1.length( ) > s2.length( ) ; } ) ;
return *(commonSubs.begin( ) ) ;
}
 
int main( ) {
std::string s1 ("thisisatest" ) ;
std::string s2 ( "testing123testing" ) ;
std::cout << "The longest common substring of " << s1 << " and " << s2 << " is:\n" ;
std::cout << lcs ( s1 , s2 ) << " !\n" ;
return 0 ;
}
Output:
The longest common substring of thisisatest and testing123testing is:
test !

C#[edit]

Using dynamic programming[edit]

using System;
 
namespace LongestCommonSubstring
{
class Program
{
static void Main(string[] args)
{
Console.WriteLine(lcs("thisisatest", "testing123testing"));
Console.ReadKey(true);
}
 
public static string lcs(string a, string b)
{
var lengths = new int[a.Length, b.Length];
int greatestLength = 0;
string output = "";
for (int i = 0; i < a.Length; i++)
{
for (int j = 0; j < b.Length; j++)
{
if (a[i] == b[j])
{
lengths[i, j] = i == 0 || j == 0 ? 1 : lengths[i - 1, j - 1] + 1;
if (lengths[i, j] > greatestLength)
{
greatestLength = lengths[i, j];
output = a.Substring(i - greatestLength + 1, greatestLength);
}
}
else
{
lengths[i, j] = 0;
}
}
}
return output;
}
}
}
Output:
test

Searching for smaller substrings of a in b[edit]

Translation of: REXX
//C# program tests the LCSUBSTR (Longest Common Substring) subroutine.
using System;
namespace LongestCommonSubstring
{
class Program
{
static void Main(string[] args)
{
string a = args.Length >= 1 ? args[0] : ""; /*get two arguments (strings). */
string b = args.Length == 2 ? args[1] : "";
if (a == "") a = "thisisatest"; /*use this string for a default. */
if (b == "") b = "testing123testing"; /* " " " " " " */
Console.WriteLine("string A = {0}", a); /*echo string A to screen. */
Console.WriteLine("string B = {0}", b); /*echo string B to screen. */
Console.WriteLine("LCsubstr = {0}", LCsubstr(a, b)); /*tell Longest Common Substring. */
Console.ReadKey(true);
} /*stick a fork in it, we're done.*/
 
/*─────────────────────────────────LCSUBSTR subroutine─────────────────────────────────*/
public static string LCsubstr(string x, string y) /*Longest Common Substring. */
{
string output = "";
int lenx = x.Length; /*shortcut for using the X length*/
for (int j = 0; j < lenx; j++) /*step through start points in X.*/
{
for (int k = lenx - j; k > -1; k--) /*step through string lengths. */
{
string common = x.Substring(j, k); /*extract a common substring. */
if (y.IndexOf(common) > -1 && common.Length > output.Length) output = common; /*longest?*/
} /*k*/
} /*j*/
return output; /*$ is "" if no common string. */
}
}
}

output when using the default inputs:

   string A = thisisatest
   string B = testing123testing
   LCsubstr = test

Searching for smaller substrings of a in b (simplified)[edit]

Translation of: zkl
//C# program tests the LCS (Longest Common Substring) subroutine.
using System;
namespace LongestCommonSubstring
{
class Program
{
static void Main(string[] args)
{
string a = args.Length >= 1 ? args[0] : ""; /*get two arguments (strings). */
string b = args.Length == 2 ? args[1] : "";
if (a == "") a = "thisisatest"; /*use this string for a default. */
if (b == "") b = "testing123testing"; /* " " " " " " */
Console.WriteLine("string A = {0}", a); /*echo string A to screen. */
Console.WriteLine("string B = {0}", b); /*echo string B to screen. */
Console.WriteLine("LCS = {0}", lcs(a, b)); /*tell Longest Common Substring. */
Console.ReadKey(true);
} /*stick a fork in it, we're done.*/
 
/*─────────────────────────────────LCS subroutine─────────────────────────────────*/
private static string lcs(string a, string b)
{
if(b.Length<a.Length){ string t=a; a=b; b=t; }
for (int n = a.Length; n > 0; n--)
{
for (int m = a.Length-n; m <= a.Length-n; m++)
{
string s=a.Substring(m,n);
if(b.Contains(s)) return(s);
}
}
return "";
}
}
 

output when using the default inputs:

   string A = thisisatest
   string B = testing123testing
   LCS = test

Elixir[edit]

Works with: Elixir version 1.3
defmodule LCS do
def longest_common_substring(a,b) do
alist = to_charlist(a) |> Enum.with_index
blist = to_charlist(b) |> Enum.with_index
lengths = for i <- 0..length(alist)-1, j <- 0..length(blist), into: %{}, do: {{i,j},0}
Enum.reduce(alist, {lengths,0,""}, fn {x,i},acc ->
Enum.reduce(blist, acc, fn {y,j},{map,gleng,lcs} ->
if x==y do
len = if i==0 or j==0, do: 1, else: map[{i-1,j-1}]+1
map = %{map | {i,j} => len}
if len > gleng, do: {map, len, String.slice(a, i - len + 1, len)},
else: {map, gleng, lcs}
else
{map, gleng, lcs}
end
end)
end)
|> elem(2)
end
end
 
IO.puts LCS.longest_common_substring("thisisatest", "testing123testing")
Output:
test

Go[edit]

Translation of: C#
package main
 
import "fmt"
 
func lcs(a, b string) (output string) {
lengths := make([]int, len(a)*len(b))
greatestLength := 0
for i, x := range a {
for j, y := range b {
if x == y {
if i == 0 || j == 0 {
lengths[i*len(b)+j] = 1
} else {
lengths[i*len(b)+j] = lengths[(i-1)*len(b)+j-1] + 1
}
if lengths[i*len(b)+j] > greatestLength {
greatestLength = lengths[i*len(b)+j]
output = a[i-greatestLength+1 : i+1]
}
}
}
}
return
}
 
func main() {
fmt.Println(lcs("thisisatest", "testing123testing"))
}
Output:
test

Haskell[edit]

import Data.Ord (comparing)
import Data.List (maximumBy, intersect)
 
subStrings :: String -> [String]
subStrings s =
let intChars = length s
in [ take n $ drop i s
| i <- [0 .. intChars - 1]
, n <- [1 .. intChars - i] ]
 
longestCommon :: String -> String -> String
longestCommon a b =
maximumBy (comparing length) (subStrings a `intersect` subStrings b)
 
main :: IO ()
main = putStrLn $ longestCommon "testing123testing" "thisisatest"
Output:
test

J[edit]

This algorithm starts by comparing each character in the one string to each character in the other, and then iterates on this result until it finds the length of the longest common substring. So if Lx is the length of one argument string, Ly is the length of the other argument string, and Lr is the length of the result string, this algorithm uses space on the order of Lx*Ly and time on the order of Lx*Ly*Lr.

In other words: this can be suitable for small problems, but you might want something better if you're comparing gigabyte length strings with high commonality.

lcstr=:4 :0
C=. ({.~ 1+$) x=/y
M=. >./ (* * * >. * + (_1&|.)@:|:^:2)^:_ C
N=. >./ M
y {~ (M i. N)-i.-N
)

Intermedate results:

   C shows which characters are in common between the two strings.
   M marks the length of the longest common substring ending at each position in the right argument
   N is the length of the longest common substring

Example use:

   'thisisatest' lcs 'testing123testing'
test

Java[edit]

public class LongestCommonSubstring {
 
public static void main(String[] args) {
System.out.println(lcs("testing123testing", "thisisatest"));
}
 
static String lcs(String a, String b) {
if (a.length() > b.length())
return lcs(b, a);
 
String res = "";
for (int ai = 0; ai < a.length(); ai++) {
for (int len = a.length() - ai; len > 0; len--) {
 
for (int bi = 0; bi < b.length() - len; bi++) {
 
if (a.regionMatches(ai, b, bi, len) && len > res.length()) {
res = a.substring(ai, ai + len);
}
}
}
}
return res;
}
}
test

jq[edit]

Translation of: C#, Go, Ruby
Works with: jq version 1.4

Utility functions:

# Create an m x n matrix
def matrix(m; n; init):
if m == 0 then []
elif m == 1 then [range(0;n) | init]
elif m > 0 then
matrix(1;n;init) as $row
| [range(0;m) | $row ]
else error("matrix\(m);_;_) invalid")
end;
 
def set(i;j; value):
setpath([i,j]; value);

Longest Common Substring:

def lcs(a; b):
matrix(a|length; b|length; 0) as $lengths
# state: [ $lengths, greatestLength, answer ]
| [$lengths, 0]
| reduce range(0; a|length) as $i
(.;
reduce range(0; b|length) as $j
(.;
if a[$i:$i+1] == b[$j:$j+1] then
(if $i == 0 or $j == 0 then 1
else .[0][$i-1][$j-1] + 1
end) as $x
| .[0] |= set($i; $j; $x)
| if $x > .[1] then
.[1] = $x
| .[2] = a[1+$i - $x : 1+$i] # output
else .
end
else .
end )) | .[2];

Example:

lcs("thisisatest"; "testing123testing")
Output:
$ jq -n -f Longest_common_substring.jq
"test"

Kotlin[edit]

Translation of: Java
// version 1.1.2
 
fun lcs(a: String, b: String): String {
if (a.length > b.length) return lcs(b, a)
var res = ""
for (ai in 0 until a.length) {
for (len in a.length - ai downTo 1) {
for (bi in 0 until b.length - len) {
if (a.regionMatches(ai, b, bi,len) && len > res.length) {
res = a.substring(ai, ai + len)
}
}
}
}
return res
}
 
fun main(args: Array<String>) = println(lcs("testing123testing", "thisisatest"))
Output:
test

Mathematica[edit]

The function LongestCommonSubsequence returns the longest common substring, and LongestCommonSequence returns the longest common subsequence.

Print[LongestCommonSubsequence["thisisatest", "testing123testing"]];
Output:
test

Perl[edit]

#!/usr/bin/perl
use strict ;
use warnings ;
 
sub longestCommonSubstr {
my $first = shift ;
my $second = shift ;
my %firstsubs = findSubstrings ( $first );
my %secondsubs = findSubstrings ( $second ) ;
my @commonsubs ;
foreach my $subst ( keys %firstsubs ) {
if ( exists $secondsubs{ $subst } ) {
push ( @commonsubs , $subst ) ;
}
}
my @sorted = sort { length $b <=> length $a } @commonsubs ;
return $sorted[0] ;
}
 
sub findSubstrings {
my $string = shift ;
my %substrings ;
my $l = length $string ;
for ( my $start = 0 ; $start < $l ; $start++ ) {
for ( my $howmany = 1 ; $howmany < $l - $start + 1 ; $howmany++) {
$substrings{substr( $string , $start , $howmany) } = 1 ;
}
}
return %substrings ;
}
 
my $longest = longestCommonSubstr( "thisisatest" ,"testing123testing" ) ;
print "The longest common substring of <thisisatest> and <testing123testing> is $longest !\n" ;
 
Output:
The longest common substring of <thisisatest> and <testing123testing> is test !

Perl 6[edit]

use v6;
 
sub createSubstrings( Str $word --> Array ) {
my $length = $word.chars ;
my @substrings ;
for (0..$length - 1) -> $start {
for (1..$length - $start) -> $howmany {
@substrings.push( $word.substr( $start , $howmany ) ) ;
}
}
return @substrings ;
}
 
sub findLongestCommon( Str $first , Str $second --> Str ) {
my @substringsFirst = createSubstrings( $first ) ;
my @substringsSecond = createSubstrings( $second ) ;
my $firstset = set( @substringsFirst ) ;
my $secondset = set( @substringsSecond ) ;
my $common = $firstset (&) $secondset ;
return $common.keys.sort({$^b.chars <=> $^a.chars})[0] ;
}
 
sub MAIN( Str $first , Str $second ) {
my $phrase = "The longest common substring of $first and $second is " ~
"{findLongestCommon( $first , $second ) } !" ;
$phrase.say ;
}
Output:
The longest common substring of thisisatest and testing123testing is test !

PicoLisp[edit]

(de longestCommonSubstring (Str1 Str2)
(setq Str1 (chop Str1) Str2 (chop Str2))
(let Res NIL
(map
'((Lst1)
(map
'((Lst2)
(let Len 0
(find
'((A B) (nand (= A B) (inc 'Len)))
Lst1
Lst2 )
(when (> Len (length Res))
(setq Res (head Len Lst1)) ) ) )
Str2 ) )
Str1 )
(pack Res) ) )

Test:

: (longestCommonSubstring "thisisatest" "testing123testing")
-> "test"

Python[edit]

Using Indexes[edit]

 
import re
s1 = "thisisatest"
s2 = "testing123testing"
longest = ""
i = 0
for x in s1:
if re.search(x, s2):
s = x
while re.search(s, s2):
if len(s)>len(longest):
longest = s
if i+len(s) == len(s1):
break
s = s1[i:i+len(s)+1]
i += 1
print longest
Output:
"test"

Racket[edit]

A chance to show off how to use HashTable types in typed/racket

#lang typed/racket
(: lcs (String String -> String))
(define (lcs a b)
(: all-substrings# (String -> (HashTable String Boolean)))
(define (all-substrings# str)
(define l (string-length str))
(for*/hash : (HashTable String Boolean)
((s (in-range 0 l)) (e (in-range (add1 s) (add1 l))))
(values (substring str s e) #t)))
 
(define a# (all-substrings# a))
 
(define b# (all-substrings# b))
 
(define-values (s l)
(for/fold : (Values String Nonnegative-Integer)
((s "") (l : Nonnegative-Integer 0))
((a_ (in-hash-keys a#))
#:when (and (> (string-length a_) l) (hash-ref b# a_ #f)))
(values a_ (string-length a_))))
 
s)
 
(module+ test
("thisisatest" . lcs . "testing123testing"))
Output:
"test"

REXX[edit]

/*REXX program determines the  LCSUBSTR (Longest Common Substring) subroutine.*/
parse arg a b . /*obtain optional arguments from the CL*/
if a=='' then a= "thisisatest" /*Not specified? Then use the default.*/
if b=='' then b= "testing123testing" /* " " " " " " */
say ' string A =' a /*echo string A to the terminal screen.*/
say ' string B =' b /* " " B " " " " */
say ' LCsubstr =' LCsubstr(a,b) /*display the Longest Common Substring.*/
exit /*stick a fork in it, we're all done. */
/*────────────────────────────────────────────────────────────────────────────*/
LCsubstr: procedure; parse arg x,y,,$ /*LCsubstr: Longest Common Substring. */
L=length(x) /*a placeholder for string length of X*/
do j=1 for L /*step through start points in string X*/
do k=L by -1 for L /*step through string lengths. */
_=strip(substr(x,j,k)) /*extract a common substring. */
if pos(_,y)\==0 & length(_)>length($) then $=_ /*longest? */
end /*k*/ /* [↑] see if this string is longer. */
end /*j*/
return $ /*$: (null if there isn't common str.)*/

output when using the default inputs:

   string A = thisisatest
   string B = testing123testing
   LCsubstr = test

Ring[edit]

 
# Project : Longest Common Substring
# Date  : 2017/09/22
# Author : Gal Zsolt (~ CalmoSoft ~)
# Email  : <[email protected]>
 
str1 = "testing123testing"
str2 = "tsitest"
see longest(str1, str2)
 
func longest(str1, str2)
subarr = []
for n=1 to len(str1)
for m=1 to len(str1)
sub = substr(str1, n, m)
if substr(str2, sub) > 0
add(subarr, sub)
ok
next
next
 
temp = 0
for n=1 to len(subarr)
if len(subarr[n]) > temp
temp = len(subarr[n])
subend = subarr[n]
ok
next
see subend + nl
 

Output:

test

Ruby[edit]

Translation of: C#
def longest_common_substring(a,b)
lengths = Array.new(a.length){Array.new(b.length, 0)}
greatestLength = 0
output = ""
a.each_char.with_index do |x,i|
b.each_char.with_index do |y,j|
next if x != y
lengths[i][j] = (i.zero? || j.zero?) ? 1 : lengths[i-1][j-1] + 1
if lengths[i][j] > greatestLength
greatestLength = lengths[i][j]
output = a[i - greatestLength + 1, greatestLength]
end
end
end
output
end
 
p longest_common_substring("thisisatest", "testing123testing")
Output:
"test"

Sidef[edit]

Translation of: Perl 6
func createSubstrings(String word) -> Array {
gather {
combinations(word.len+1, 2, {|i,j|
take(word.substr(i, j-i))
})
}
}
 
func findLongestCommon(String first, String second) -> String {
createSubstrings(first) & createSubstrings(second) -> max_by { .len }
}
 
say findLongestCommon("thisisatest", "testing123testing")
Output:
test

Swift[edit]

func lComSubStr<
S0: Sliceable, S1: Sliceable, T: Equatable where
S0.Generator.Element == T, S1.Generator.Element == T,
S0.Index.Distance == Int, S1.Index.Distance == Int
>(w1: S0, _ w2: S1) -> S0.SubSlice {
 
var (len, end) = (0, 0)
 
let empty = Array(Repeat(count: w2.count + 1, repeatedValue: 0))
var mat: [[Int]] = Array(Repeat(count: w1.count + 1, repeatedValue: empty))
 
for (i, sLett) in w1.enumerate() {
for (j, tLett) in w2.enumerate() where tLett == sLett {
let curLen = mat[i][j] + 1
mat[i + 1][j + 1] = curLen
if curLen > len {
len = curLen
end = i
}
}
}
return w1[advance(w1.startIndex, (end + 1) - len)...advance(w1.startIndex, end)]
}
 
func lComSubStr(w1: String, _ w2: String) -> String {
return String(lComSubStr(w1.characters, w2.characters))
}

Output:

lComSubStr("thisisatest", "testing123testing") // "test"

VBScript[edit]

 
Function lcs(string1,string2)
For i = 1 To Len(string1)
tlcs = tlcs & Mid(string1,i,1)
If InStr(string2,tlcs) Then
If Len(tlcs) > Len(lcs) Then
lcs = tlcs
End If
Else
tlcs = ""
End If
Next
End Function
 
WScript.Echo lcs(WScript.Arguments(0),WScript.Arguments(1))
 
Output:

Invoke the script from a command prompt.

C:\>cscript.exe /nologo lcs.vbs "thisisatest" "testing123testing"
test

zkl[edit]

fcn lcd(a,b){
if(b.len()<a.len()){ t:=a; a=b; b=t; }
foreach n,m in ([a.len()..1,-1],a.len()-n+1){
s:=a[m,n];
if(b.holds(s)) return(s);
}
""
}
lcd("testing123testing","thisisatest").println();
Output:
test