N-grams: Difference between revisions
Not a robot (talk | contribs) Add SETL |
Not a robot (talk | contribs) Add Refal |
||
Line 965: | Line 965: | ||
<pre>("IV"=>2,"T "=>1,"VE"=>2,"E "=>1,"LE"=>1,"AN"=>1,"LI"=>2,"ND"=>1,"ET"=>1," L"=>2," A"=>1,"D "=>1).Bag |
<pre>("IV"=>2,"T "=>1,"VE"=>2,"E "=>1,"LE"=>1,"AN"=>1,"LI"=>2,"ND"=>1,"ET"=>1," L"=>2," A"=>1,"D "=>1).Bag |
||
("ET "=>1,"AND"=>1,"LIV"=>2," LI"=>1,"ND "=>1," LE"=>1,"IVE"=>2,"E A"=>1,"VE "=>1,"T L"=>1,"D L"=>1,"LET"=>1," AN"=>1).Bag</pre> |
("ET "=>1,"AND"=>1,"LIV"=>2," LI"=>1,"ND "=>1," LE"=>1,"IVE"=>2,"E A"=>1,"VE "=>1,"T L"=>1,"D L"=>1,"LET"=>1," AN"=>1).Bag</pre> |
||
=={{header|Refal}}== |
|||
<syntaxhighlight lang="refal">$ENTRY Go { |
|||
, 'LIVE AND LET LIVE': e.Str |
|||
= <ShowNgrams 2 e.Str> |
|||
<ShowNgrams 3 e.Str> |
|||
<ShowNgrams 4 e.Str>; |
|||
}; |
|||
ShowNgrams { |
|||
s.N e.Str = |
|||
<Prout <Symb s.N> '-grams of "' e.Str '":'> |
|||
<ShowLines 5 <Ngrams s.N e.Str>> |
|||
<Prout>; |
|||
}; |
|||
ShowLines { |
|||
s.N = ; |
|||
s.N e.X, <First s.N e.X>: (e.L) e.R = |
|||
<Prout <Each DispNgram e.L>> <ShowLines s.N e.R>; |
|||
}; |
|||
Each { |
|||
s.F = ; |
|||
s.F t.I e.Is = <Mu s.F t.I> <Each s.F e.Is>; |
|||
}; |
|||
DispNgram { |
|||
((e.S) s.C) = '(' e.S ') - ' <Symb s.C> ' '; |
|||
}; |
|||
Ngrams { |
|||
s.N e.Str = <Count () <Groups s.N e.Str>>; |
|||
}; |
|||
Groups { |
|||
s.N e.X, <Lenw e.X>: s.L e.X, <Compare s.L s.N>: { |
|||
'-' = ; |
|||
s.C, <First s.N e.X>: (e.G) e.R, e.X: s.Z e.Y = |
|||
(e.G) <Groups s.N e.Y>; |
|||
} |
|||
}; |
|||
Count { |
|||
(e.Cs) = e.Cs; |
|||
(e.Cs) t.I e.Is = <Count (<Inc (e.Cs) t.I>) e.Is>; |
|||
}; |
|||
Inc { |
|||
(e.X (t.I s.C) e.Y) t.I = e.X (t.I <+ 1 s.C>) e.Y; |
|||
(e.X) t.I = e.X (t.I 1); |
|||
};</syntaxhighlight> |
|||
{{out}} |
|||
<pre>2-grams of "LIVE AND LET LIVE": |
|||
(LI) - 2 (IV) - 2 (VE) - 2 (E ) - 1 ( A) - 1 |
|||
(AN) - 1 (ND) - 1 (D ) - 1 ( L) - 2 (LE) - 1 |
|||
(ET) - 1 (T ) - 1 |
|||
3-grams of "LIVE AND LET LIVE": |
|||
(LIV) - 2 (IVE) - 2 (VE ) - 1 (E A) - 1 ( AN) - 1 |
|||
(AND) - 1 (ND ) - 1 (D L) - 1 ( LE) - 1 (LET) - 1 |
|||
(ET ) - 1 (T L) - 1 ( LI) - 1 |
|||
4-grams of "LIVE AND LET LIVE": |
|||
(LIVE) - 2 (IVE ) - 1 (VE A) - 1 (E AN) - 1 ( AND) - 1 |
|||
(AND ) - 1 (ND L) - 1 (D LE) - 1 ( LET) - 1 (LET ) - 1 |
|||
(ET L) - 1 (T LI) - 1 ( LIV) - 1</pre> |
|||
=={{header|RPL}}== |
=={{header|RPL}}== |
Revision as of 10:33, 10 May 2024
An N-gram is a sequence of N contiguous elements of a given text. Although N-grams refer sometimes to words or syllables, in this task we will consider only sequences of characters. The task consists in, given a text and an integer size of the desired N-grams, find all the different contiguous sequences of N characters, together with the number of times they appear in the text. For example, the 2-grams of the text "Live and let live" are:
"LI" - 2 "IV" - 2 "VE" - 2 " L" - 2 "E " - 1 " A" - 1 "AN" - 1 "ND" - 1 "D " - 1 "LE" - 1 "ET" - 1 "T " - 1
Note that space and other non-alphanumeric characters are taken into account.
- See also
ALGOL 68
BEGIN # split text into n-grams - n character substrings #
MODE NGRAM = STRUCT( STRING gram, INT count );
OP UCASE = ( CHAR c )CHAR: # return c converted to upper case #
IF c >= "a" AND c <= "z" THEN REPR( ( ABS c - ABS "a" ) + ABS "A" ) ELSE c FI;
OP UCASE = ( STRING s )STRING: # return s converted to upper case #
BEGIN
STRING uc := s;
FOR i FROM LWB uc TO UPB uc DO uc[ i ] := UCASE uc[ i ] OD;
uc
END # UCASE # ;
OP LENGTH = ( STRING s )INT: ( UPB s + 1 ) - LWB s;
# returns the n-grams of text - using a simple array to contain the #
# n-grams - for longer strings, an associative array might be better #
PRIO GRAMS = 1;
OP GRAMS = ( INT n, STRING text )[]NGRAM:
BEGIN
[ 1 : ( LENGTH text + 1 ) - n ]NGRAM result; # initially assume #
INT count := 0; # all the n-grams will be unique #
FOR pos FROM LWB text TO ( UPB text + 1 )- n DO
STRING ng = UCASE text[ pos : pos + ( n - 1 ) ];
BOOL found := FALSE;
INT g pos := 0;
FOR g FROM 1 TO count
WHILE g pos := g;
NOT ( found := ng = ( gram OF result )[ g ] )
DO SKIP OD;
IF NOT found THEN
result[ count +:= 1 ] := ( ng, 1 )
ELSE
( count OF result )[ g pos ] +:= 1
FI
OD;
result[ 1 : count ]
END # NGRAMS # ;
# prints the ngrams in ngrams #
PROC print ngrams = ( STRING title, text, []NGRAM ngrams )VOID:
BEGIN
print( ( title, "-grams of """, text, """:", newline ) );
FOR g FROM LWB ngrams TO UPB ngrams DO
print( ( " """, ( gram OF ngrams )[ g ] ) );
print( ( """: ", whole( ( count OF ngrams )[ g ], 0 ), newline ) )
OD
END # print ngrams # ;
STRING test = "Live and let live";
print ngrams( "bi", test, 2 GRAMS test );
print ngrams( "tri", test, 3 GRAMS test );
print ngrams( "quad", test, 4 GRAMS test )
END
- Output:
bi-grams of "Live and let live": "LI": 2 "IV": 2 "VE": 2 "E ": 1 " A": 1 "AN": 1 "ND": 1 "D ": 1 " L": 2 "LE": 1 "ET": 1 "T ": 1 tri-grams of "Live and let live": "LIV": 2 "IVE": 2 "VE ": 1 "E A": 1 " AN": 1 "AND": 1 "ND ": 1 "D L": 1 " LE": 1 "LET": 1 "ET ": 1 "T L": 1 " LI": 1 quad-grams of "Live and let live": "LIVE": 2 "IVE ": 1 "VE A": 1 "E AN": 1 " AND": 1 "AND ": 1 "ND L": 1 "D LE": 1 " LET": 1 "LET ": 1 "ET L": 1 "T LI": 1 " LIV": 1
APL
ngrams ← (⊣,(≢⊢))⌸,/
- Output:
2 3 4 ngrams¨ ⊂'LIVE AND LET LIVE' LI 2 LIV 2 LIVE 2 IV 2 IVE 2 IVE 1 VE 2 VE 1 VE A 1 E 1 E A 1 E AN 1 A 1 AN 1 AND 1 AN 1 AND 1 AND 1 ND 1 ND 1 ND L 1 D 1 D L 1 D LE 1 L 2 LE 1 LET 1 LE 1 LET 1 LET 1 ET 1 ET 1 ET L 1 T 1 T L 1 T LI 1 LI 1 LIV 1
Arturo
ngrams: function [s :string n :integer][
0..sub size s n | map 'i -> slice upper s i i+n-1
| tally
]
loop [2 3 4] 'n [
print ~"|n|-grams:"
loop ngrams "Live and let live" n [k v] -> print [~{"|k|"} v]
print ""
]
- Output:
2-grams: "LI" 2 "IV" 2 "VE" 2 "E " 1 " A" 1 "AN" 1 "ND" 1 "D " 1 " L" 2 "LE" 1 "ET" 1 "T " 1 3-grams: "LIV" 2 "IVE" 2 "VE " 1 "E A" 1 " AN" 1 "AND" 1 "ND " 1 "D L" 1 " LE" 1 "LET" 1 "ET " 1 "T L" 1 " LI" 1 4-grams: "LIVE" 2 "IVE " 1 "VE A" 1 "E AN" 1 " AND" 1 "AND " 1 "ND L" 1 "D LE" 1 " LET" 1 "LET " 1 "ET L" 1 "T LI" 1 " LIV" 1
C
#include <stdio.h>
#include <stdbool.h>
#include <ctype.h>
#include <string.h>
#define MAX_N 4
#define MAX_NGRAMS 20
typedef struct {
char str[MAX_N+1];
int freq;
} ngram;
void *strUpper(char *s) {
while (*s) {
*s = toupper(*s);
s++;
}
}
void ngrams(int n, char *text) {
int i, j, count = 0;
size_t len = strlen(text);
bool found;
char temp[MAX_N+1] = {'\0'};
ngram ng, ngrams[MAX_NGRAMS];
char s[len+1];
strcpy(s, text);
strUpper(s);
for (i = 0; i <= len-n; ++i) {
strncpy(temp, s + i, n);
found = false;
for (j = 0; j < count; ++j) {
if (!strcmp(ngrams[j].str, temp)) {
ngrams[j].freq++;
found = true;
break;
}
}
if (!found) {
strncpy(ng.str, temp, n);
ng.freq = 1;
ngrams[count++] = ng;
}
}
for (i = 0; i < count; ++i) {
printf("(\"%s\": %d) ", ngrams[i].str, ngrams[i].freq);
if (!((i+1)%5)) printf("\n");
}
printf("\n\n");
}
int main() {
int n;
char *text = "Live and let live";
for (n = 2; n <= MAX_N; ++n) {
printf("All %d-grams of '%s' and their frequencies:\n", n, text);
ngrams(n, text);
}
return 0;
}
- Output:
All 2-grams of 'Live and let live' and their frequencies: ("LI": 2) ("IV": 2) ("VE": 2) ("E ": 1) (" A": 1) ("AN": 1) ("ND": 1) ("D ": 1) (" L": 2) ("LE": 1) ("ET": 1) ("T ": 1) All 3-grams of 'Live and let live' and their frequencies: ("LIV": 2) ("IVE": 2) ("VE ": 1) ("E A": 1) (" AN": 1) ("AND": 1) ("ND ": 1) ("D L": 1) (" LE": 1) ("LET": 1) ("ET ": 1) ("T L": 1) (" LI": 1) All 4-grams of 'Live and let live' and their frequencies: ("LIVE": 2) ("IVE ": 1) ("VE A": 1) ("E AN": 1) (" AND": 1) ("AND ": 1) ("ND L": 1) ("D LE": 1) (" LET": 1) ("LET ": 1) ("ET L": 1) ("T LI": 1) (" LIV": 1)
Common Lisp
A hash table is used to store and retrieve the n-grams fast.
(defun n-grams (text n)
"Return a list of all the N-grams of length n in the text, together with their frequency"
(let* (res (*ht-n-grams* (make-hash-table :test 'equal)) )
(loop for i from 0 to (- (length text) n) do
(let* ((n-gram (string-upcase (subseq text i (+ i n))))
(freq (gethash n-gram *ht-n-grams*)))
(setf (gethash n-gram *ht-n-grams*) (if (null freq) 1 (1+ freq))) ))
(maphash #'(lambda (key val)
(push (cons key val) res) )
*ht-n-grams* )
(sort res #'> :key #'cdr) ))
- Output:
> (n-grams "Live and let live" 2)
(("LI" . 2) ("IV" . 2) ("VE" . 2) (" L" . 2) ("E " . 1) (" A" . 1) ("AN" . 1)
("ND" . 1) ("D " . 1) ("LE" . 1) ("ET" . 1) ("T " . 1))
F#
// N-grams. Nigel Galloway: April 2nd., 2024
let gram (n:string) g=let n=n.ToUpper() in n|>Seq.windowed g|>Seq.countBy id
for n,g in (gram "Live and let live" 2) do printfn "%A %d" n g
- Output:
[|'L'; 'I'|] 2 [|'I'; 'V'|] 2 [|'V'; 'E'|] 2 [|'E'; ' '|] 1 [|' '; 'A'|] 1 [|'A'; 'N'|] 1 [|'N'; 'D'|] 1 [|'D'; ' '|] 1 [|' '; 'L'|] 2 [|'L'; 'E'|] 1 [|'E'; 'T'|] 1 [|'T'; ' '|] 1
Factor
USING: ascii grouping kernel math.statistics prettyprint ;
: n-grams ( str n -- assoc ) [ >upper ] dip clump histogram ;
"Live and let live" 2 n-grams .
- Output:
H{ { "ET" 1 } { "IV" 2 } { "T " 1 } { " A" 1 } { "VE" 2 } { "LI" 2 } { "E " 1 } { "D " 1 } { " L" 2 } { "ND" 1 } { "LE" 1 } { "AN" 1 } }
Haskell
import Control.Applicative (ZipList (ZipList, getZipList))
import Data.Char (toUpper)
import Data.List (tails)
import qualified Data.Map.Strict as M
------------------- MAP OF N-GRAM COUNTS -----------------
nGramCounts :: Int -> String -> M.Map String Int
nGramCounts n =
foldr (flip (M.insertWith (+)) 1) M.empty . windows n
------------------------- GENERIC ------------------------
windows :: Int -> [a] -> [[a]]
windows n = transpose . take n . tails
transpose :: [[a]] -> [[a]]
transpose [] = []
transpose xs = getZipList (traverse ZipList xs)
--------------------------- TEST -------------------------
main :: IO ()
main =
let sample = toUpper <$> "Live and let live"
in mapM_
( \n ->
putStrLn (show n <> "-GRAMS:")
>> mapM_ print ((M.assocs . nGramCounts n) sample)
>> putStrLn ""
)
[0 .. 4]
- Output:
0-GRAMS: 1-GRAMS: (" ",3) ("A",1) ("D",1) ("E",3) ("I",2) ("L",3) ("N",1) ("T",1) ("V",2) 2-GRAMS: (" A",1) (" L",2) ("AN",1) ("D ",1) ("E ",1) ("ET",1) ("IV",2) ("LE",1) ("LI",2) ("ND",1) ("T ",1) ("VE",2) 3-GRAMS: (" AN",1) (" LE",1) (" LI",1) ("AND",1) ("D L",1) ("E A",1) ("ET ",1) ("IVE",2) ("LET",1) ("LIV",2) ("ND ",1) ("T L",1) ("VE ",1) 4-GRAMS: (" AND",1) (" LET",1) (" LIV",1) ("AND ",1) ("D LE",1) ("E AN",1) ("ET L",1) ("IVE ",1) ("LET ",1) ("LIVE",2) ("ND L",1) ("T LI",1) ("VE A",1)
jq
Works with jq and gojq, that is, the C and Go implementations of jq.
# Generic "bag of words" utility:
def bow(stream):
reduce stream as $word ({}; .[($word|tostring)] += 1);
# The ngrams as a bow
def ngrams($n):
ascii_upcase as $text
| bow( range(0;$text|1+ length - $n) as $i | $text[$i:$i+$n]);
# The task
# Sort by increasing frequency, then by lexicographical order
def ngrams($text; $n):
($text|ngrams($n)) as $ngrams
| "\nAll \($n)-grams of '\($text)' and their frequencies:",
($ngrams|to_entries|sort_by(.value,.key)[] | "\(.key): \(.value)" ) ;
ngrams("Live and let live"; 2,3,4)
- Output:
All 2-grams of 'Live and let live' and their frequencies: A: 1 AN: 1 D : 1 E : 1 ET: 1 LE: 1 ND: 1 T : 1 L: 2 IV: 2 LI: 2 VE: 2 All 3-grams of 'Live and let live' and their frequencies: AN: 1 LE: 1 LI: 1 AND: 1 D L: 1 E A: 1 ET : 1 LET: 1 ND : 1 T L: 1 VE : 1 IVE: 2 LIV: 2 All 4-grams of 'Live and let live' and their frequencies: AND: 1 LET: 1 LIV: 1 AND : 1 D LE: 1 E AN: 1 ET L: 1 IVE : 1 LET : 1 ND L: 1 T LI: 1 VE A: 1 LIVE: 2
Julia
function ngrams(str::AbstractString, n; uppercaseinput = true)
s = uppercaseinput ? uppercase(str) : str
unique([(ng, count(ng, s)) for ng in [SubString(s, i:i+n-1) for i=1:length(s)-n+1]])
end
function eightcolumns(arr)
for (i, elem) in pairs(arr)
print(lpad(elem, 10), i % 8 == 0 ? "\n" : "")
end
println("\n")
end
const s = "Live and let live"
ngrams(s, 1) |> eightcolumns
ngrams(s, 2) |> eightcolumns
ngrams(s, 2, uppercaseinput = false) |> eightcolumns
- Output:
("L", 3) ("I", 2) ("V", 2) ("E", 3) (" ", 3) ("A", 1) ("N", 1) ("D", 1) ("T", 1) ("LI", 2) ("IV", 2) ("VE", 2) ("E ", 1) (" A", 1) ("AN", 1) ("ND", 1) ("D ", 1) (" L", 2) ("LE", 1) ("ET", 1) ("T ", 1) ("Li", 1) ("iv", 2) ("ve", 2) ("e ", 1) (" a", 1) ("an", 1) ("nd", 1) ("d ", 1) (" l", 2) ("le", 1) ("et", 1) ("t ", 1) ("li", 1)
Nim
import std/[strutils, tables]
type NGrams = CountTable[string]
func ngrams(text: string; n: Positive): NGrams =
for i in 0..(text.len - n):
result.inc(text[i..<(i + n)].toLowerAscii)
const Text = "Live and let live"
for n in 2..4:
echo n, "-grams:"
var ng = Text.ngrams(n)
ng.sort() # To display n-grams with higher score first.
for key, count in ng:
echo "“$1”: $2".format(key, count)
echo()
- Output:
2-grams: “ve”: 2 “li”: 2 “iv”: 2 “ l”: 2 “d ”: 1 “et”: 1 “t ”: 1 “an”: 1 “nd”: 1 “e ”: 1 “le”: 1 “ a”: 1 3-grams: “ive”: 2 “liv”: 2 “ le”: 1 “nd ”: 1 “and”: 1 “et ”: 1 “ve ”: 1 “t l”: 1 “ an”: 1 “d l”: 1 “e a”: 1 “let”: 1 “ li”: 1 4-grams: “live”: 2 “ liv”: 1 “ and”: 1 “e an”: 1 “ let”: 1 “and ”: 1 “d le”: 1 “t li”: 1 “nd l”: 1 “et l”: 1 “ive ”: 1 “let ”: 1 “ve a”: 1
Perl
use v5.36;
sub n_gram ($n, $line) {
my %N;
map { $N{substr lc($line),$_,$n}++ } 0..length($line)-$n;
%N
}
my %bi_grams = n_gram 2, 'Live and let live';
say qq|'$_' - $bi_grams{$_}| for sort keys %bi_grams;
say '';
my %tri_grams = n_gram 3, 'Live and let live';
say qq|'$_' - $tri_grams{$_}| for sort keys %tri_grams;
- Output:
' a' - 1 ' l' - 2 'an' - 1 'd ' - 1 'e ' - 1 'et' - 1 'iv' - 2 'le' - 1 'li' - 2 'nd' - 1 't ' - 1 've' - 2 ' an' - 1 ' le' - 1 ' li' - 1 'and' - 1 'd l' - 1 'e a' - 1 'et ' - 1 'ive' - 2 'let' - 1 'liv' - 2 'nd ' - 1 't l' - 1 've ' - 1
Phix
A dictionary is used to find the index of already-seen n-grams, even though a simpler find() would be good enough for this task.
I have replicated most orderings found on this page, the task description order corresponds to orig/freq,
and jq is alpha/freq with high last, but there is no equivalent for the Factor or Raku orderings here ;-).
with javascript_semantics function n_grams(integer len, string txt, sequence orders) sequence ng = {}, ngc = {} integer d = new_dict() txt = upper(txt) for i=1 to length(txt)-len+1 do string tn = txt[i..i+len-1] integer ndx = getdd(tn,0,d) if ndx=0 then ng = append(ng,tn) ngc = append(ngc,1) ndx = length(ng) setd(tn,ndx,d) else ngc[ndx] += 1 end if end for destroy_dict(d) integer l = length(ng) sequence ares = columnize({ng,ngc,tagset(l)}), res = {} for c in orders do if c="original" then -- original/first found order res = append(res,ares) elsif c="orig/freq" then -- "" but higher freq first res = append(res,sort_columns(ares,{-2,3})) elsif c="alphabetic" then -- alphabetical order res = append(res,sort(deep_copy(ares))) elsif c="alpha/freq" then -- "" but higher freq first res = append(res,sort_columns(ares,{-2,1})) else ?9/0 -- (unknown ordering requested) end if end for return res end function constant src = "Live and let live", orders = {"original","orig/freq", "alphabetic","alpha/freq"} printf(1,"For \"%s\":\n",src) for l=2 to 4 do sequence res = n_grams(l,src,orders) string count = ordinal(length(res[1]),true) printf(1,"There are %s unique %d-grams:\n",{count,l}) for i,r in res do printf(1,"%12s: %s\n",{orders[i],join(r,", ",fmt:="%s %d")}) end for end for
- Output:
For "Live and let live": There are twelve unique 2-grams: original: LI 2, IV 2, VE 2, E 1, A 1, AN 1, ND 1, D 1, L 2, LE 1, ET 1, T 1 orig/freq: LI 2, IV 2, VE 2, L 2, E 1, A 1, AN 1, ND 1, D 1, LE 1, ET 1, T 1 alphabetic: A 1, L 2, AN 1, D 1, E 1, ET 1, IV 2, LE 1, LI 2, ND 1, T 1, VE 2 alpha/freq: L 2, IV 2, LI 2, VE 2, A 1, AN 1, D 1, E 1, ET 1, LE 1, ND 1, T 1 There are thirteen unique 3-grams: original: LIV 2, IVE 2, VE 1, E A 1, AN 1, AND 1, ND 1, D L 1, LE 1, LET 1, ET 1, T L 1, LI 1 orig/freq: LIV 2, IVE 2, VE 1, E A 1, AN 1, AND 1, ND 1, D L 1, LE 1, LET 1, ET 1, T L 1, LI 1 alphabetic: AN 1, LE 1, LI 1, AND 1, D L 1, E A 1, ET 1, IVE 2, LET 1, LIV 2, ND 1, T L 1, VE 1 alpha/freq: IVE 2, LIV 2, AN 1, LE 1, LI 1, AND 1, D L 1, E A 1, ET 1, LET 1, ND 1, T L 1, VE 1 There are thirteen unique 4-grams: original: LIVE 2, IVE 1, VE A 1, E AN 1, AND 1, AND 1, ND L 1, D LE 1, LET 1, LET 1, ET L 1, T LI 1, LIV 1 orig/freq: LIVE 2, IVE 1, VE A 1, E AN 1, AND 1, AND 1, ND L 1, D LE 1, LET 1, LET 1, ET L 1, T LI 1, LIV 1 alphabetic: AND 1, LET 1, LIV 1, AND 1, D LE 1, E AN 1, ET L 1, IVE 1, LET 1, LIVE 2, ND L 1, T LI 1, VE A 1 alpha/freq: LIVE 2, AND 1, LET 1, LIV 1, AND 1, D LE 1, E AN 1, ET L 1, IVE 1, LET 1, ND L 1, T LI 1, VE A 1
Python
import pprint
from collections import Counter
from typing import Iterable
def n_grams(text: str, n: int) -> Iterable[str]:
"""Generate contiguous sequences of _n_ characters from _text_."""
if n < 1:
raise ValueError("n must be an integer > 0")
text = text.upper()
return (text[i : (i + n)] for i in range(len(text) - n + 1))
def main() -> None:
example_text = "Live and let live"
for n in range(2, 5):
counts = Counter(n_grams(example_text, n)).most_common()
print(
f"{len(counts)} {n}-grams of {example_text!r}:\n",
pprint.pformat(counts, compact=True),
end="\n\n",
)
if __name__ == "__main__":
main()
- Output:
12 2-grams of 'Live and let live': [('LI', 2), ('IV', 2), ('VE', 2), (' L', 2), ('E ', 1), (' A', 1), ('AN', 1), ('ND', 1), ('D ', 1), ('LE', 1), ('ET', 1), ('T ', 1)] 13 3-grams of 'Live and let live': [('LIV', 2), ('IVE', 2), ('VE ', 1), ('E A', 1), (' AN', 1), ('AND', 1), ('ND ', 1), ('D L', 1), (' LE', 1), ('LET', 1), ('ET ', 1), ('T L', 1), (' LI', 1)] 13 4-grams of 'Live and let live': [('LIVE', 2), ('IVE ', 1), ('VE A', 1), ('E AN', 1), (' AND', 1), ('AND ', 1), ('ND L', 1), ('D LE', 1), (' LET', 1), ('LET ', 1), ('ET L', 1), ('T LI', 1), (' LIV', 1)]
Sliding window
This example takes inspiration from the sliding_window recipe found in Python's itertools docs.
import pprint
from collections import Counter
from collections import deque
from itertools import islice
from typing import Iterable
def n_grams(text: str, n: int) -> Iterable[str]:
"""Generate contiguous sequences of _n_ characters from _text_."""
it = iter(text.upper())
n_gram = deque(islice(it, n), maxlen=n)
if len(n_gram) == n:
yield "".join(n_gram)
for x in it:
n_gram.append(x)
yield "".join(n_gram)
def main() -> None:
example_text = "Live and let live"
for n in range(2, 5):
counts = Counter(n_grams(example_text, n)).most_common()
print(
f"{len(counts)} {n}-grams of {example_text!r}:\n",
pprint.pformat(counts, compact=True),
end="\n\n",
)
if __name__ == "__main__":
main()
- Output:
12 2-grams of 'Live and let live': [('LI', 2), ('IV', 2), ('VE', 2), (' L', 2), ('E ', 1), (' A', 1), ('AN', 1), ('ND', 1), ('D ', 1), ('LE', 1), ('ET', 1), ('T ', 1)] 13 3-grams of 'Live and let live': [('LIV', 2), ('IVE', 2), ('VE ', 1), ('E A', 1), (' AN', 1), ('AND', 1), ('ND ', 1), ('D L', 1), (' LE', 1), ('LET', 1), ('ET ', 1), ('T L', 1), (' LI', 1)] 13 4-grams of 'Live and let live': [('LIVE', 2), ('IVE ', 1), ('VE A', 1), ('E AN', 1), (' AND', 1), ('AND ', 1), ('ND L', 1), ('D LE', 1), (' LET', 1), ('LET ', 1), ('ET L', 1), ('T LI', 1), (' LIV', 1)]
And a strict variant, compositionally assembled from some basics:
from itertools import (islice)
from functools import (reduce)
from operator import (add)
def nGramCounts(n, s):
'''A dictionary of all nGrams of dimension n in s,
with the frequency of their occurrence.
'''
return reduce(
lambda a, gram: insertWith(add, gram, 1, a),
nGrams(n, s),
{}
)
def nGrams(n, s):
'''All case-insensitive sequences of length n in the string s.'''
return (''.join(t) for t in windows(n, list(s.upper())))
# ----------------------- GENERICS -----------------------
def insertWith(f, k, x, dct):
'''A new dictionary updated with a
(key, f(value, x)) tuple.
Where there is no existing value for the key,
the supplied x is used as the default.
'''
return dict(dct, **{k: f(dct[k], x) if k in dct else x})
def tails(xs):
'''All final segments of xs, longest first.'''
return (xs[i:] for i in range(0, 1 + len(xs)))
def windows(n, xs):
'''Sliding windows of dimension n.'''
return zip(*islice(tails(xs), n))
# ------------------------- TEST -------------------------
if __name__ == "__main__":
import pprint
EXAMPLE = "Live and let live"
for dimension in range(1, 5):
result = sorted(nGramCounts(dimension, EXAMPLE).items())
print(
f"{len(result)} {dimension}-grams of {EXAMPLE!r}:\n",
pprint.pformat(result),
end="\n\n",
)
- Output:
9 1-grams of 'Live and let live': [(' ', 3), ('A', 1), ('D', 1), ('E', 3), ('I', 2), ('L', 3), ('N', 1), ('T', 1), ('V', 2)] 12 2-grams of 'Live and let live': [(' A', 1), (' L', 2), ('AN', 1), ('D ', 1), ('E ', 1), ('ET', 1), ('IV', 2), ('LE', 1), ('LI', 2), ('ND', 1), ('T ', 1), ('VE', 2)] 13 3-grams of 'Live and let live': [(' AN', 1), (' LE', 1), (' LI', 1), ('AND', 1), ('D L', 1), ('E A', 1), ('ET ', 1), ('IVE', 2), ('LET', 1), ('LIV', 2), ('ND ', 1), ('T L', 1), ('VE ', 1)] 13 4-grams of 'Live and let live': [(' AND', 1), (' LET', 1), (' LIV', 1), ('AND ', 1), ('D LE', 1), ('E AN', 1), ('ET L', 1), ('IVE ', 1), ('LET ', 1), ('LIVE', 2), ('ND L', 1), ('T LI', 1), ('VE A', 1)]
Raku
sub n-gram ($this, $N=2) { Bag.new( flat $this.uc.map: { .comb.rotor($N => -($N-1))».join } ) }
dd 'Live and let live'.&n-gram; # bi-gram
dd 'Live and let live'.&n-gram(3); # tri-gram
- Output:
("IV"=>2,"T "=>1,"VE"=>2,"E "=>1,"LE"=>1,"AN"=>1,"LI"=>2,"ND"=>1,"ET"=>1," L"=>2," A"=>1,"D "=>1).Bag ("ET "=>1,"AND"=>1,"LIV"=>2," LI"=>1,"ND "=>1," LE"=>1,"IVE"=>2,"E A"=>1,"VE "=>1,"T L"=>1,"D L"=>1,"LET"=>1," AN"=>1).Bag
Refal
$ENTRY Go {
, 'LIVE AND LET LIVE': e.Str
= <ShowNgrams 2 e.Str>
<ShowNgrams 3 e.Str>
<ShowNgrams 4 e.Str>;
};
ShowNgrams {
s.N e.Str =
<Prout <Symb s.N> '-grams of "' e.Str '":'>
<ShowLines 5 <Ngrams s.N e.Str>>
<Prout>;
};
ShowLines {
s.N = ;
s.N e.X, <First s.N e.X>: (e.L) e.R =
<Prout <Each DispNgram e.L>> <ShowLines s.N e.R>;
};
Each {
s.F = ;
s.F t.I e.Is = <Mu s.F t.I> <Each s.F e.Is>;
};
DispNgram {
((e.S) s.C) = '(' e.S ') - ' <Symb s.C> ' ';
};
Ngrams {
s.N e.Str = <Count () <Groups s.N e.Str>>;
};
Groups {
s.N e.X, <Lenw e.X>: s.L e.X, <Compare s.L s.N>: {
'-' = ;
s.C, <First s.N e.X>: (e.G) e.R, e.X: s.Z e.Y =
(e.G) <Groups s.N e.Y>;
}
};
Count {
(e.Cs) = e.Cs;
(e.Cs) t.I e.Is = <Count (<Inc (e.Cs) t.I>) e.Is>;
};
Inc {
(e.X (t.I s.C) e.Y) t.I = e.X (t.I <+ 1 s.C>) e.Y;
(e.X) t.I = e.X (t.I 1);
};
- Output:
2-grams of "LIVE AND LET LIVE": (LI) - 2 (IV) - 2 (VE) - 2 (E ) - 1 ( A) - 1 (AN) - 1 (ND) - 1 (D ) - 1 ( L) - 2 (LE) - 1 (ET) - 1 (T ) - 1 3-grams of "LIVE AND LET LIVE": (LIV) - 2 (IVE) - 2 (VE ) - 1 (E A) - 1 ( AN) - 1 (AND) - 1 (ND ) - 1 (D L) - 1 ( LE) - 1 (LET) - 1 (ET ) - 1 (T L) - 1 ( LI) - 1 4-grams of "LIVE AND LET LIVE": (LIVE) - 2 (IVE ) - 1 (VE A) - 1 (E AN) - 1 ( AND) - 1 (AND ) - 1 (ND L) - 1 (D LE) - 1 ( LET) - 1 (LET ) - 1 (ET L) - 1 (T LI) - 1 ( LIV) - 1
RPL
RPL code | Comment |
---|---|
≪ → text n ≪ { } DUP n text SIZE FOR j text j n - 1 + j SUB IF DUP2 POS THEN LAST 4 ROLL SWAP DUP2 GET 1 + PUT SWAP DROP SWAP ELSE + SWAP 1 + SWAP END NEXT SHOWG ≫ ≫ ‘-GRAMS’ STO ≪ { } 1 3 PICK SIZE FOR j OVER j GET "=" + 4 PICK j GET →STR + + NEXT ROT ROT DROP2 ≫ ‘SHOWG’ STO |
-GRAMS ( text n -- { "ngram=count".. } ) Initialize 2 empty lists; for j = n to length(text): ngram = text[j-n+1..j] if ngram already in ngram list increase counter in other list get rid of ngram else add to ngram list and set counter at 1 on the other list Show results SHOWG ( { "ngram".. } { counts.. } -- { "ngram=count".. } ) |
- Input:
"LIVE AND LET LIVE" 2 -GRAMS "LIVE AND LET LIVE" 3 -GRAMS "LIVE AND LET LIVE" 4 -GRAMS
- Output:
3: { "LI=2" "IV=2" "VE=2" "E =1" " A=1" "AN=1" "ND=1" "D =1" " L=2" "LE=1" "ET=1" "T =1" } 2: { "LIV=2" "IVE=2" "VE =1" "E A=1" " AN=1" "AND=1" "ND =1" "D L=1" " LE=1" "LET=1" "ET =1" "T L=1" " LI=1" } 1: { "LIVE=2" "IVE =1" "VE A=1" "E AN=1" " AND=1" "AND =1" "ND L=1" "D LE=1" " LET=1" "LET =1" "ET L=1" "T LI=1" " LIV=1" }
SETL
program find_ngrams;
input := "LIVE AND LET LIVE";
loop for size in [2,3,4] do
print(str size+"-grams of '"+input+"':");
ng := ngrams(input, size);
col := 0;
loop for count = ng(ngram) do
nprint(rpad("'" + ngram + "': " + str count, 10));
if (col +:= 1) mod 8 = 0 then print; end if;
end loop;
print;
print;
end loop;
proc ngrams(input, size);
ng := {};
loop for i in [1..#input-size+1] do
ng(input(i..i+size-1)) +:= 1;
end loop;
return ng;
end proc;
end program;
- Output:
2-grams of 'LIVE AND LET LIVE': ' A': 1 ' L': 2 'AN': 1 'D ': 1 'E ': 1 'ET': 1 'IV': 2 'LE': 1 'LI': 2 'ND': 1 'T ': 1 'VE': 2 3-grams of 'LIVE AND LET LIVE': ' AN': 1 ' LE': 1 ' LI': 1 'AND': 1 'D L': 1 'E A': 1 'ET ': 1 'IVE': 2 'LET': 1 'LIV': 2 'ND ': 1 'T L': 1 'VE ': 1 4-grams of 'LIVE AND LET LIVE': ' AND': 1 ' LET': 1 ' LIV': 1 'AND ': 1 'D LE': 1 'E AN': 1 'ET L': 1 'IVE ': 1 'LET ': 1 'LIVE': 2 'ND L': 1 'T LI': 1 'VE A': 1
Wren
Version 1 (Sorted order)
import "./str" for Str
import "./maputil" for MultiSet
import "./fmt" for Fmt
var findNgrams = Fn.new { |n, text|
text = Str.upper(text)
var ngrams = {}
for (i in 0..text.count-n) {
var ngram = text[i...i+n]
MultiSet.add(ngrams, ngram)
}
return ngrams
}
// sort by decreasing frequency, then by lexicographical order
var comparer = Fn.new { |i, j|
if (i.value != j.value) return j.value < i.value
return Str.lt(i.key, j.key)
}
var text = "Live and let live"
for (n in [2, 3, 4]) {
var ngrams = findNgrams.call(n, text)
System.print("All %(n)-grams of '%(text)' and their frequencies:")
var ng = ngrams.toList.sort(comparer).map { |me| "(\"%(me.key)\" : %(me.value))"}
Fmt.tprint("$s ", ng, 5)
System.print()
}
- Output:
All 2-grams of 'Live and let live' and their frequencies: (" L" : 2) ("IV" : 2) ("LI" : 2) ("VE" : 2) (" A" : 1) ("AN" : 1) ("D " : 1) ("E " : 1) ("ET" : 1) ("LE" : 1) ("ND" : 1) ("T " : 1) All 3-grams of 'Live and let live' and their frequencies: ("IVE" : 2) ("LIV" : 2) (" AN" : 1) (" LE" : 1) (" LI" : 1) ("AND" : 1) ("D L" : 1) ("E A" : 1) ("ET " : 1) ("LET" : 1) ("ND " : 1) ("T L" : 1) ("VE " : 1) All 4-grams of 'Live and let live' and their frequencies: ("LIVE" : 2) (" AND" : 1) (" LET" : 1) (" LIV" : 1) ("AND " : 1) ("D LE" : 1) ("E AN" : 1) ("ET L" : 1) ("IVE " : 1) ("LET " : 1) ("ND L" : 1) ("T LI" : 1) ("VE A" : 1)
Version 2 (Original order)
The iteration order of 'Map' objects in Wren is undefined though they can subsequently be sorted into a particular order as the first version shows. However, to maintain the original order of insertion we need to use one of the classes in the above module which automatically keep track of such order when items are added or removed.
import "./str" for Str
import "./ordered" for OrderedBag
import "./fmt" for Fmt
var findNgrams = Fn.new { |n, text|
text = Str.upper(text)
var ngrams = OrderedBag.new()
for (i in 0..text.count-n) {
var ngram = text[i...i+n]
ngrams.add(ngram)
}
return ngrams
}
var text = "Live and let live"
for (n in [2, 3, 4]) {
var ngrams = findNgrams.call(n, text)
System.print("All %(n)-grams of '%(text)' and their frequencies:")
var ng = ngrams.toList.map { |me| "(\"%(me.key)\" : %(me.value))"}
Fmt.tprint("$s ", ng, 5)
System.print()
}
- Output:
All 2-grams of 'Live and let live' and their frequencies: ("LI" : 2) ("IV" : 2) ("VE" : 2) ("E " : 1) (" A" : 1) ("AN" : 1) ("ND" : 1) ("D " : 1) (" L" : 2) ("LE" : 1) ("ET" : 1) ("T " : 1) All 3-grams of 'Live and let live' and their frequencies: ("LIV" : 2) ("IVE" : 2) ("VE " : 1) ("E A" : 1) (" AN" : 1) ("AND" : 1) ("ND " : 1) ("D L" : 1) (" LE" : 1) ("LET" : 1) ("ET " : 1) ("T L" : 1) (" LI" : 1) All 4-grams of 'Live and let live' and their frequencies: ("LIVE" : 2) ("IVE " : 1) ("VE A" : 1) ("E AN" : 1) (" AND" : 1) ("AND " : 1) ("ND L" : 1) ("D LE" : 1) (" LET" : 1) ("LET " : 1) ("ET L" : 1) ("T LI" : 1) (" LIV" : 1)
XPL0
int Dict(100), Count(100), Size;
proc LookUp(Wd); \Add word to dictionary, or increment its count
int Wd, I;
[for I:= 0 to Size-1 do
if Dict(I) = Wd then
[Count(I):= Count(I)+1;
return;
];
Dict(Size):= Wd;
Count(Size):= 1;
Size:= Size+1;
];
proc ShowNGram(N, Str); \Show N-grams for string
char N, Str;
int I, J, Wd, Ch;
[IntOut(0, N); Text(0, "-grams:^m^j");
Size:= 0; I:= 0;
loop [Wd:= 0;
for J:= 0 to N-1 do
[Ch:= Str(I+J);
if Ch = $A0 then quit; \terminating space
if Ch>=^a and Ch<=^z then Ch:= Ch & ~$20;
Wd:= Wd<<8 + Ch;
];
I:= I+1;
LookUp(Wd);
];
for I:= 0 to Size-1 do
[Wd:= Dict(I);
for J:= N-1 downto 0 do
ChOut(0, Wd>>(J*8));
ChOut(0, ^ );
IntOut(0, Count(I));
if rem(I/5) = 4 then CrLf(0) else ChOut(0, 9\tab\);
];
CrLf(0);
];
int N;
for N:= 2 to 4 do ShowNGram(N, "Live and let live ")
- Output:
2-grams: LI 2 IV 2 VE 2 E 1 A 1 AN 1 ND 1 D 1 L 2 LE 1 ET 1 T 1 3-grams: LIV 2 IVE 2 VE 1 E A 1 AN 1 AND 1 ND 1 D L 1 LE 1 LET 1 ET 1 T L 1 LI 1 4-grams: LIVE 2 IVE 1 VE A 1 E AN 1 AND 1 AND 1 ND L 1 D LE 1 LET 1 LET 1 ET L 1 T LI 1 LIV 1