N-grams: Difference between revisions

8,695 bytes added ,  2 months ago
→‎{{header|Python}}: Add simpler (and faster) implementation
m (Added missing {{out}})
(→‎{{header|Python}}: Add simpler (and faster) implementation)
(9 intermediate revisions by 4 users not shown)
Line 23:
;* [[Sorensen–Dice_coefficient|Related task: Sorensen–Dice coefficient]]
 
 
=={{header|ALGOL 68}}==
<syntaxhighlight lang="algol68">
BEGIN # split text into n-grams - n character substrings #
 
MODE NGRAM = STRUCT( STRING gram, INT count );
 
OP UCASE = ( CHAR c )CHAR: # return c converted to upper case #
# only the letters a to z are mapped, any other character is returned unchanged #
IF c >= "a" AND c <= "z" THEN REPR( ( ABS c - ABS "a" ) + ABS "A" ) ELSE c FI;
OP UCASE = ( STRING s )STRING: # return s converted to upper case #
BEGIN
STRING uc := s; # work on a copy of s #
# convert the copy one character at a time with the CHAR form of UCASE #
FOR i FROM LWB uc TO UPB uc DO uc[ i ] := UCASE uc[ i ] OD;
uc
END # UCASE # ;
# returns the number of characters in s, derived from its lower and upper bounds #
OP LENGTH = ( STRING s )INT: ( UPB s + 1 ) - LWB s;
 
# returns the n-grams of text - using a simple array to contain the #
# n-grams - for longer strings, an associative array might be better #
PRIO GRAMS = 1;
# n GRAMS text yields the distinct upper-cased n-character substrings of #
# text, in order of first occurrence, each with its number of occurrences #
OP GRAMS = ( INT n, STRING text )[]NGRAM:
BEGIN
[ 1 : ( LENGTH text + 1 ) - n ]NGRAM result; # initially assume #
INT count := 0; # all the n-grams will be unique #
FOR pos FROM LWB text TO ( UPB text + 1 )- n DO
STRING ng = UCASE text[ pos : pos + ( n - 1 ) ];
BOOL found := FALSE;
INT g pos := 0;
# linear search of the n-grams stored so far: g pos records the index #
# being examined and the loop stops as soon as ng matches an entry #
FOR g FROM 1 TO count
WHILE g pos := g;
NOT ( found := ng = ( gram OF result )[ g ] )
DO SKIP OD;
IF NOT found THEN
# first occurrence: append a new entry with a count of 1 #
result[ count +:= 1 ] := ( ng, 1 )
ELSE
# seen before: increment the count of the matching entry #
( count OF result )[ g pos ] +:= 1
FI
OD;
# only the first count elements of result were filled #
result[ 1 : count ]
END # GRAMS # ;
 
# prints the ngrams in ngrams #
PROC print ngrams = ( STRING title, text, []NGRAM ngrams )VOID:
BEGIN
# heading line: the title followed by -grams of and the quoted text #
print( ( title, "-grams of """, text, """:", newline ) );
# then one line per element of ngrams: the quoted gram and its count #
FOR g FROM LWB ngrams TO UPB ngrams DO
print( ( " """, ( gram OF ngrams )[ g ] ) );
print( ( """: ", whole( ( count OF ngrams )[ g ], 0 ), newline ) )
OD
END # print ngrams # ;
 
# show the 2-, 3- and 4-character n-grams of the test string #
STRING test = "Live and let live";
print ngrams( "bi", test, 2 GRAMS test );
print ngrams( "tri", test, 3 GRAMS test );
print ngrams( "quad", test, 4 GRAMS test )
 
END
</syntaxhighlight>
{{out}}
<pre>
bi-grams of "Live and let live":
"LI": 2
"IV": 2
"VE": 2
"E ": 1
" A": 1
"AN": 1
"ND": 1
"D ": 1
" L": 2
"LE": 1
"ET": 1
"T ": 1
tri-grams of "Live and let live":
"LIV": 2
"IVE": 2
"VE ": 1
"E A": 1
" AN": 1
"AND": 1
"ND ": 1
"D L": 1
" LE": 1
"LET": 1
"ET ": 1
"T L": 1
" LI": 1
quad-grams of "Live and let live":
"LIVE": 2
"IVE ": 1
"VE A": 1
"E AN": 1
" AND": 1
"AND ": 1
"ND L": 1
"D LE": 1
" LET": 1
"LET ": 1
"ET L": 1
"T LI": 1
" LIV": 1
</pre>
 
=={{header|Arturo}}==
Line 189 ⟶ 291:
</syntaxhighlight>
 
=={{header|F_Sharp|F#}}==
<syntaxhighlight lang="fsharp">
// N-grams. Nigel Galloway: April 2nd., 2024
/// Upper-cases n, slides a window of g characters over it and pairs each
/// distinct window with the number of times it occurs.
let gram (n:string) g =
    n.ToUpper()
    |> Seq.windowed g
    |> Seq.countBy id
// Print every 2-character window of the sample sentence together with its count.
for n,g in (gram "Live and let live" 2) do printfn "%A %d" n g
</syntaxhighlight>
{{out}}
<pre>
[|'L'; 'I'|] 2
[|'I'; 'V'|] 2
[|'V'; 'E'|] 2
[|'E'; ' '|] 1
[|' '; 'A'|] 1
[|'A'; 'N'|] 1
[|'N'; 'D'|] 1
[|'D'; ' '|] 1
[|' '; 'L'|] 2
[|'L'; 'E'|] 1
[|'E'; 'T'|] 1
[|'T'; ' '|] 1
</pre>
=={{header|Factor}}==
{{works with|Factor|0.99 2022-04-03}}
Line 213 ⟶ 336:
}
</pre>
 
=={{header|Haskell}}==
 
<syntaxhighlight lang=haskell>import Control.Applicative (ZipList (ZipList, getZipList))
import Data.Char (toUpper)
import Data.List (tails)
import qualified Data.Map.Strict as M
 
------------------- MAP OF N-GRAM COUNTS -----------------
 
-- | Map from each window of width n in the given string to the number
-- of times that window occurs.
nGramCounts :: Int -> String -> M.Map String Int
nGramCounts n text = foldr bump M.empty (windows n text)
  where
    -- Add one to the tally for this gram, starting from 1 when it is new.
    bump gram = M.insertWith (+) gram 1
 
 
------------------------- GENERIC ------------------------
 
-- | Sliding windows of width n: the first n tails of the list are
-- zipped together by 'transpose', so element i of every window comes
-- from the i-th tail.
windows :: Int -> [a] -> [[a]]
windows n xs = transpose (take n (tails xs))
 
-- | Zip-style transposition: the result is as long as the shortest row.
transpose :: [[a]] -> [[a]]
transpose rows
  -- With no rows the ZipList traversal would be 'pure []', an endless
  -- list of empty lists, so the empty case is answered directly.
  | null rows = []
  | otherwise = getZipList (sequenceA (map ZipList rows))
 
 
--------------------------- TEST -------------------------
-- | Prints, for each n from 0 to 4, a heading followed by the
-- (n-gram, count) pairs of the upper-cased sample sentence and a blank line.
main :: IO ()
main =
-- the sample is upper-cased once, before any windows are taken
let sample = toUpper <$> "Live and let live"
in mapM_
( \n ->
putStrLn (show n <> "-GRAMS:")
>> mapM_ print ((M.assocs . nGramCounts n) sample)
>> putStrLn ""
)
[0 .. 4]</syntaxhighlight>
{{Out}}
<pre>0-GRAMS:
 
1-GRAMS:
(" ",3)
("A",1)
("D",1)
("E",3)
("I",2)
("L",3)
("N",1)
("T",1)
("V",2)
 
2-GRAMS:
(" A",1)
(" L",2)
("AN",1)
("D ",1)
("E ",1)
("ET",1)
("IV",2)
("LE",1)
("LI",2)
("ND",1)
("T ",1)
("VE",2)
 
3-GRAMS:
(" AN",1)
(" LE",1)
(" LI",1)
("AND",1)
("D L",1)
("E A",1)
("ET ",1)
("IVE",2)
("LET",1)
("LIV",2)
("ND ",1)
("T L",1)
("VE ",1)
 
4-GRAMS:
(" AND",1)
(" LET",1)
(" LIV",1)
("AND ",1)
("D LE",1)
("E AN",1)
("ET L",1)
("IVE ",1)
("LET ",1)
("LIVE",2)
("ND L",1)
("T LI",1)
("VE A",1)</pre>
 
=={{header|jq}}==
Line 502 ⟶ 718:
=={{header|Python}}==
 
This example generates n-grams lazily, much like the [https://docs.python.org/3/library/itertools.html#itertools-recipes sliding_window recipe] from the Python itertools docs.

<syntaxhighlight lang="python">
import pprint
from collections import Counter
from typing import Iterable
 
 
def n_grams(text: str, n: int) -> Iterable[str]:
    """Return a lazy iterable of every run of _n_ consecutive characters in _text_.

    The text is upper-cased first. The argument check runs when the
    function is called, not when the result is first iterated.

    Raises:
        ValueError: if _n_ is less than 1.
    """
    if n < 1:
        raise ValueError("n must be an integer > 0")

    upper = text.upper()
    # Number of start positions; zero or negative gives an empty range.
    n_starts = len(upper) - n + 1
    return (upper[start : start + n] for start in range(n_starts))
 
 
def main() -> None:
    """Print the n-gram counts of a sample sentence for n = 2, 3 and 4."""
    example_text = "Live and let live"

    for n in (2, 3, 4):
        # most_common() gives (gram, count) pairs, highest count first.
        counts = Counter(n_grams(example_text, n)).most_common()
        heading = f"{len(counts)} {n}-grams of {example_text!r}:\n"
        listing = pprint.pformat(counts, compact=True)
        print(heading, listing, end="\n\n")


if __name__ == "__main__":
    main()
</syntaxhighlight>
 
{{out}}
<pre>
12 2-grams of 'Live and let live':
[('LI', 2), ('IV', 2), ('VE', 2), (' L', 2), ('E ', 1), (' A', 1), ('AN', 1),
('ND', 1), ('D ', 1), ('LE', 1), ('ET', 1), ('T ', 1)]
 
13 3-grams of 'Live and let live':
[('LIV', 2), ('IVE', 2), ('VE ', 1), ('E A', 1), (' AN', 1), ('AND', 1),
('ND ', 1), ('D L', 1), (' LE', 1), ('LET', 1), ('ET ', 1), ('T L', 1),
(' LI', 1)]
 
13 4-grams of 'Live and let live':
[('LIVE', 2), ('IVE ', 1), ('VE A', 1), ('E AN', 1), (' AND', 1), ('AND ', 1),
('ND L', 1), ('D LE', 1), (' LET', 1), ('LET ', 1), ('ET L', 1), ('T LI', 1),
(' LIV', 1)]
</pre>
 
 
===Sliding window===
 
This example takes inspiration from the [https://docs.python.org/3/library/itertools.html#itertools-recipes sliding_window recipe] found in Python's itertools docs.
 
<syntaxhighlight lang="python">
import pprint
from collections import Counter
from collections import deque
from itertools import islice
from typing import Iterable
 
 
def n_grams(text: str, n: int) -> Iterable[str]:
    """Yield every run of _n_ consecutive characters of the upper-cased _text_.

    A bounded deque holds the current window: appending a character
    drops the oldest one, so each step yields the next n-gram.
    """
    chars = iter(text.upper())
    window = deque(islice(chars, n), maxlen=n)

    # Fewer than n characters means the iterator is already exhausted,
    # so there is nothing to yield.
    if len(window) != n:
        return

    yield "".join(window)
    for ch in chars:
        window.append(ch)
        yield "".join(window)
 
 
def main() -> None:
    """Print the n-gram counts of a sample sentence for n = 2, 3 and 4."""
    example_text = "Live and let live"

    for n in range(2, 5):
        # most_common() gives (gram, count) pairs, highest count first.
        counts = Counter(n_grams(example_text, n)).most_common()
        print(
            f"{len(counts)} {n}-grams of {example_text!r}:\n",
            pprint.pformat(counts, compact=True),
            end="\n\n",
        )


if __name__ == "__main__":
    main()
</syntaxhighlight>
 
Line 551 ⟶ 822:
(' LIV', 1)]
</pre>
 
 
And a strict variant, compositionally assembled from some basics:
 
<syntaxhighlight lang="python">from itertools import (islice)
from functools import (reduce)
from operator import (add)
 
 
def nGramCounts(n, s):
    '''A dictionary mapping each nGram of dimension n in s
       to the number of times it occurs.
    '''
    def tally(counts, gram):
        # Each step returns a new dictionary with gram's count raised by 1.
        return insertWith(add, gram, 1, counts)

    return reduce(tally, nGrams(n, s), {})
 
 
def nGrams(n, s):
    '''A generator over the upper-cased runs of n
       consecutive characters in the string s.
    '''
    characters = list(s.upper())
    return (''.join(window) for window in windows(n, characters))
 
 
# ----------------------- GENERICS -----------------------
 
def insertWith(f, k, x, dct):
    '''A new dictionary updated with a
       (key, f(value, x)) tuple.
       Where there is no existing value for the key,
       the supplied x is used as the default.
       The dictionary passed in is not modified.
    '''
    value = f(dct[k], x) if k in dct else x
    # Dict-display unpacking accepts any hashable key, whereas
    # dict(dct, **{k: value}) raises TypeError unless k is a string.
    return {**dct, k: value}
 
 
def tails(xs):
    '''A generator over every suffix of xs, from the whole
       sequence down to the empty one.
    '''
    stop = len(xs) + 1
    return (xs[start:] for start in range(stop))
 
 
def windows(n, xs):
    '''Sliding windows of dimension n over xs, as tuples.'''
    # Zipping the first n tails lines up element i of each window
    # with the i-th tail; zip stops at the shortest tail.
    first_n_tails = islice(tails(xs), n)
    return zip(*first_n_tails)
 
 
 
# ------------------------- TEST -------------------------
# Demo: print the sorted n-gram counts of a sample sentence.
if __name__ == "__main__":
import pprint

EXAMPLE = "Live and let live"

# One report per n-gram dimension from 1 to 4, entries sorted by n-gram.
for dimension in range(1, 5):
result = sorted(nGramCounts(dimension, EXAMPLE).items())
print(
f"{len(result)} {dimension}-grams of {EXAMPLE!r}:\n",
pprint.pformat(result),
end="\n\n",
)</syntaxhighlight>
{{Out}}
<pre>9 1-grams of 'Live and let live':
[(' ', 3),
('A', 1),
('D', 1),
('E', 3),
('I', 2),
('L', 3),
('N', 1),
('T', 1),
('V', 2)]
 
12 2-grams of 'Live and let live':
[(' A', 1),
(' L', 2),
('AN', 1),
('D ', 1),
('E ', 1),
('ET', 1),
('IV', 2),
('LE', 1),
('LI', 2),
('ND', 1),
('T ', 1),
('VE', 2)]
 
13 3-grams of 'Live and let live':
[(' AN', 1),
(' LE', 1),
(' LI', 1),
('AND', 1),
('D L', 1),
('E A', 1),
('ET ', 1),
('IVE', 2),
('LET', 1),
('LIV', 2),
('ND ', 1),
('T L', 1),
('VE ', 1)]
 
13 4-grams of 'Live and let live':
[(' AND', 1),
(' LET', 1),
(' LIV', 1),
('AND ', 1),
('D LE', 1),
('E AN', 1),
('ET L', 1),
('IVE ', 1),
('LET ', 1),
('LIVE', 2),
('ND L', 1),
('T LI', 1),
('VE A', 1)]</pre>
 
=={{header|Raku}}==
Line 616 ⟶ 1,002:
{{libheader|Wren-maputil}}
{{libheader|Wren-fmt}}
<syntaxhighlight lang="wren">import "./str" for Str
import "./maputil" for MultiSet
import "./fmt" for Fmt
Line 665 ⟶ 1,051:
{{libheader|Wren-ordered}}
The iteration order of 'Map' objects in Wren is undefined though they can subsequently be sorted into a particular order as the first version shows. However, to maintain the original order of insertion we need to use one of the classes in the above module which automatically keep track of such order when items are added or removed.
<syntaxhighlight lang="wren">import "./str" for Str
import "./ordered" for OrderedBag
import "./fmt" for Fmt
140

edits