N-grams: Difference between revisions

N-grams (view source)

Revision as of 14:24, 8 April 2024

8,695 bytes added, 2 months ago

→‎{{header|Python}}: Add simpler (and faster) implementation

Jgrprior

140

edits

Revision as of 19:04, 12 June 2023 (view source) Lscrd (talk \| contribs) m (Added missing {{out}}) ← Older edit		Revision as of 14:24, 8 April 2024 (view source) Jgrprior (talk \| contribs) (→‎{{header\|Python}}: Add simpler (and faster) implementation) Newer edit →
(9 intermediate revisions by 4 users not shown)
Line 23: ;* [[Sorensen–Dice_coefficient\|Related task: Sorensen–Dice coefficient]] =={{header\|ALGOL 68}}== <syntaxhighlight lang="algol68"> BEGIN # split text into n-grams - n character substrings # MODE NGRAM = STRUCT( STRING gram, INT count ); OP UCASE = ( CHAR c )CHAR: # return c converted to upper case # IF c >= "a" AND c <= "z" THEN REPR( ( ABS c - ABS "a" ) + ABS "A" ) ELSE c FI; OP UCASE = ( STRING s )STRING: # return s converted to upper case # BEGIN STRING uc := s; FOR i FROM LWB uc TO UPB uc DO uc[ i ] := UCASE uc[ i ] OD; uc END # UCASE # ; OP LENGTH = ( STRING s )INT: ( UPB s + 1 ) - LWB s; # returns the n-grams of text - using a simple array to contain the # # n-grams - for longer strings, an associative array might be better # PRIO GRAMS = 1; OP GRAMS = ( INT n, STRING text )[]NGRAM: BEGIN [ 1 : ( LENGTH text + 1 ) - n ]NGRAM result; # initially assume # INT count := 0; # all the n-grams will be unique # FOR pos FROM LWB text TO ( UPB text + 1 )- n DO STRING ng = UCASE text[ pos : pos + ( n - 1 ) ]; BOOL found := FALSE; INT g pos := 0; FOR g FROM 1 TO count WHILE g pos := g; NOT ( found := ng = ( gram OF result )[ g ] ) DO SKIP OD; IF NOT found THEN result[ count +:= 1 ] := ( ng, 1 ) ELSE ( count OF result )[ g pos ] +:= 1 FI OD; result[ 1 : count ] END # NGRAMS # ; # prints the ngrams in ngrams # PROC print ngrams = ( STRING title, text, []NGRAM ngrams )VOID: BEGIN print( ( title, "-grams of """, text, """:", newline ) ); FOR g FROM LWB ngrams TO UPB ngrams DO print( ( " """, ( gram OF ngrams )[ g ] ) ); print( ( """: ", whole( ( count OF ngrams )[ g ], 0 ), newline ) ) OD END # print ngrams # ; STRING test = "Live and let live"; print ngrams( "bi", test, 2 GRAMS test ); print ngrams( "tri", test, 3 GRAMS test ); print ngrams( "quad", test, 4 GRAMS test ) END </syntaxhighlight> {{out}} <pre> bi-grams of "Live and let live": "LI": 2 "IV": 2 "VE": 2 "E ": 1 " A": 1 "AN": 1 "ND": 1 "D ": 1 " L": 2 "LE": 1 "ET": 1 "T ": 1 tri-grams of "Live and let 
live": "LIV": 2 "IVE": 2 "VE ": 1 "E A": 1 " AN": 1 "AND": 1 "ND ": 1 "D L": 1 " LE": 1 "LET": 1 "ET ": 1 "T L": 1 " LI": 1 quad-grams of "Live and let live": "LIVE": 2 "IVE ": 1 "VE A": 1 "E AN": 1 " AND": 1 "AND ": 1 "ND L": 1 "D LE": 1 " LET": 1 "LET ": 1 "ET L": 1 "T LI": 1 " LIV": 1 </pre> =={{header\|Arturo}}== Line 189 ⟶ 291: </syntaxhighlight> =={{header\|F_Sharp\|F#}}== <syntaxhighlight lang="fsharp"> // N-grams. Nigel Galloway: April 2nd., 2024 let gram (n:string) g=let n=n.ToUpper() in n\|>Seq.windowed g\|>Seq.countBy id for n,g in (gram "Live and let live" 2) do printfn "%A %d" n g </syntaxhighlight> {{out}} <pre> [\|'L'; 'I'\|] 2 [\|'I'; 'V'\|] 2 [\|'V'; 'E'\|] 2 [\|'E'; ' '\|] 1 [\|' '; 'A'\|] 1 [\|'A'; 'N'\|] 1 [\|'N'; 'D'\|] 1 [\|'D'; ' '\|] 1 [\|' '; 'L'\|] 2 [\|'L'; 'E'\|] 1 [\|'E'; 'T'\|] 1 [\|'T'; ' '\|] 1 </pre> =={{header\|Factor}}== {{works with\|Factor\|0.99 2022-04-03}} Line 213 ⟶ 336: } </pre> =={{header\|Haskell}}== <syntaxhighlight lang=haskell>import Control.Applicative (ZipList (ZipList, getZipList)) import Data.Char (toUpper) import Data.List (tails) import qualified Data.Map.Strict as M ------------------- MAP OF N-GRAM COUNTS ----------------- nGramCounts :: Int -> String -> M.Map String Int nGramCounts n = foldr (flip (M.insertWith (+)) 1) M.empty . windows n ------------------------- GENERIC ------------------------ windows :: Int -> [a] -> [[a]] windows n = transpose . take n . tails transpose :: [[a]] -> [[a]] transpose [] = [] transpose xs = getZipList (traverse ZipList xs) --------------------------- TEST ------------------------- main :: IO () main = let sample = toUpper <$> "Live and let live" in mapM_ ( \n -> putStrLn (show n <> "-GRAMS:") >> mapM_ print ((M.assocs . nGramCounts n) sample) >> putStrLn "" ) [0 .. 
4]</syntaxhighlight> {{Out}} <pre>0-GRAMS: 1-GRAMS: (" ",3) ("A",1) ("D",1) ("E",3) ("I",2) ("L",3) ("N",1) ("T",1) ("V",2) 2-GRAMS: (" A",1) (" L",2) ("AN",1) ("D ",1) ("E ",1) ("ET",1) ("IV",2) ("LE",1) ("LI",2) ("ND",1) ("T ",1) ("VE",2) 3-GRAMS: (" AN",1) (" LE",1) (" LI",1) ("AND",1) ("D L",1) ("E A",1) ("ET ",1) ("IVE",2) ("LET",1) ("LIV",2) ("ND ",1) ("T L",1) ("VE ",1) 4-GRAMS: (" AND",1) (" LET",1) (" LIV",1) ("AND ",1) ("D LE",1) ("E AN",1) ("ET L",1) ("IVE ",1) ("LET ",1) ("LIVE",2) ("ND L",1) ("T LI",1) ("VE A",1)</pre> =={{header\|jq}}== Line 502 ⟶ 718: =={{header\|Python}}== <syntaxhighlight lang="python"> ~~This example generates n-grams lazily, much like the [https://docs.python.org/3/library/itertools.html#itertools-recipes sliding_window recipe] from the Python itertools docs.~~ import pprint from collections import Counter from typing import Iterable def n_grams(text: str, n: int) -> Iterable[str]: """Generate contiguous sequences of _n_ characters from _text_.""" if n < 1: raise ValueError("n must be an integer > 0") text = text.upper() return (text[i : (i + n)] for i in range(len(text) - n + 1)) def main() -> None: example_text = "Live and let live" for n in range(2, 5): counts = Counter(n_grams(example_text, n)).most_common() print( f"{len(counts)} {n}-grams of {example_text!r}:\n", pprint.pformat(counts, compact=True), end="\n\n", ) if __name__ == "__main__": main() </syntaxhighlight> {{out}} <pre> 12 2-grams of 'Live and let live': [('LI', 2), ('IV', 2), ('VE', 2), (' L', 2), ('E ', 1), (' A', 1), ('AN', 1), ('ND', 1), ('D ', 1), ('LE', 1), ('ET', 1), ('T ', 1)] 13 3-grams of 'Live and let live': [('LIV', 2), ('IVE', 2), ('VE ', 1), ('E A', 1), (' AN', 1), ('AND', 1), ('ND ', 1), ('D L', 1), (' LE', 1), ('LET', 1), ('ET ', 1), ('T L', 1), (' LI', 1)] 13 4-grams of 'Live and let live': [('LIVE', 2), ('IVE ', 1), ('VE A', 1), ('E AN', 1), (' AND', 1), ('AND ', 1), ('ND L', 1), ('D LE', 1), (' LET', 1), ('LET ', 1), ('ET L', 1), ('T LI', 1), (' 
LIV', 1)] </pre> ===Sliding window=== This example takes inspiration from the [https://docs.python.org/3/library/itertools.html#itertools-recipes sliding_window recipe] found in Python's itertools docs. <syntaxhighlight lang="python"> import pprint from collections import Counter from collections import deque from itertools import islice from typing import Iterable def n_grams(text: str, n: int) -> Iterable[str]: """Generate contiguous sequences of _n_ characters from _text_.""" it = iter(text.upper()) ~~ngram~~n_gram = deque(islice(it, n), maxlen=n) if len(~~ngram~~n_gram) == n: yield "".join(~~ngram~~n_gram) for chx in it: ~~ngram~~n_gram.append(chx) yield "".join(~~ngram~~n_gram) def main() -> None: ~~if __name__ == "__main__":~~ example_text = "Live and let live" ~~import pprint~~ ~~example = "Live and let live"~~ for n in range(2, 5): ~~result~~counts = Counter(n_grams(~~example~~example_text, n)).most_common() print( f"{len(~~result~~counts)} {n}-grams of {~~example~~example_text!r}:\n", pprint.pformat(~~result~~counts, compact=True), end="\n\n", ) if __name__ == "__main__": main() </syntaxhighlight> Line 551 ⟶ 822: (' LIV', 1)] </pre> And a strict variant, compositionally assembled from some basics: <syntaxhighlight lang="python">from itertools import (islice) from functools import (reduce) from operator import (add) def nGramCounts(n, s): '''A dictionary of all nGrams of dimension n in s, with the frequency of their occurrence. ''' return reduce( lambda a, gram: insertWith(add, gram, 1, a), nGrams(n, s), {} ) def nGrams(n, s): '''All case-insensitive sequences of length n in the string s.''' return (''.join(t) for t in windows(n, list(s.upper()))) # ----------------------- GENERICS ----------------------- def insertWith(f, k, x, dct): '''A new dictionary updated with a (key, f(value, x)) tuple. Where there is no existing value for the key, the supplied x is used as the default. 
''' return dict(dct, **{k: f(dct[k], x) if k in dct else x}) def tails(xs): '''All final segments of xs, longest first.''' return (xs[i:] for i in range(0, 1 + len(xs))) def windows(n, xs): '''Sliding windows of dimension n.''' return zip(*islice(tails(xs), n)) # ------------------------- TEST ------------------------- if __name__ == "__main__": import pprint EXAMPLE = "Live and let live" for dimension in range(1, 5): result = sorted(nGramCounts(dimension, EXAMPLE).items()) print( f"{len(result)} {dimension}-grams of {EXAMPLE!r}:\n", pprint.pformat(result), end="\n\n", )</syntaxhighlight> {{Out}} <pre>9 1-grams of 'Live and let live': [(' ', 3), ('A', 1), ('D', 1), ('E', 3), ('I', 2), ('L', 3), ('N', 1), ('T', 1), ('V', 2)] 12 2-grams of 'Live and let live': [(' A', 1), (' L', 2), ('AN', 1), ('D ', 1), ('E ', 1), ('ET', 1), ('IV', 2), ('LE', 1), ('LI', 2), ('ND', 1), ('T ', 1), ('VE', 2)] 13 3-grams of 'Live and let live': [(' AN', 1), (' LE', 1), (' LI', 1), ('AND', 1), ('D L', 1), ('E A', 1), ('ET ', 1), ('IVE', 2), ('LET', 1), ('LIV', 2), ('ND ', 1), ('T L', 1), ('VE ', 1)] 13 4-grams of 'Live and let live': [(' AND', 1), (' LET', 1), (' LIV', 1), ('AND ', 1), ('D LE', 1), ('E AN', 1), ('ET L', 1), ('IVE ', 1), ('LET ', 1), ('LIVE', 2), ('ND L', 1), ('T LI', 1), ('VE A', 1)]</pre> =={{header\|Raku}}== Line 616 ⟶ 1,002: {{libheader\|Wren-maputil}} {{libheader\|Wren-fmt}} <syntaxhighlight lang="~~ecmascript~~wren">import "./str" for Str import "./maputil" for MultiSet import "./fmt" for Fmt Line 665 ⟶ 1,051: {{libheader\|Wren-ordered}} The iteration order of 'Map' objects in Wren is undefined though they can subsequently be sorted into a particular order as the first version shows. However, to maintain the original order of insertion we need to use one of the classes in the above module which automatically keep track of such order when items are added or removed. 
<syntaxhighlight lang="~~ecmascript~~wren">import "./str" for Str import "./ordered" for OrderedBag import "./fmt" for Fmt