N-grams: Difference between revisions
Content added Content deleted
(→{{header|Python}}: Add simpler (and faster) implementation) |
|||
Line 718: | Line 718: | ||
=={{header|Python}}== |
=={{header|Python}}== |
||
<syntaxhighlight lang="python"> |
|||
⚫ | |||
⚫ | |||
from collections import Counter |
|||
from typing import Iterable |
|||
def n_grams(text: str, n: int) -> Iterable[str]:
    """Generate contiguous sequences of _n_ characters from _text_.

    The text is upper-cased first, so the n-grams are case-insensitive.
    The result is a lazy generator; when _n_ exceeds ``len(text)`` it
    produces nothing.

    Raises:
        ValueError: if _n_ is less than 1.
    """
    if n < 1:
        raise ValueError("n must be an integer > 0")

    text = text.upper()
    # There are len(text) - n + 1 windows; a non-positive count gives an
    # empty range, so short inputs need no special case.
    return (text[i : (i + n)] for i in range(len(text) - n + 1))
|||
def main() -> None: |
|||
⚫ | |||
for n in range(2, 5): |
|||
counts = Counter(n_grams(example_text, n)).most_common() |
|||
print( |
|||
f"{len(counts)} {n}-grams of {example_text!r}:\n", |
|||
pprint.pformat(counts, compact=True), |
|||
end="\n\n", |
|||
) |
|||
⚫ | |||
main() |
|||
</syntaxhighlight> |
|||
{{out}} |
|||
<pre> |
|||
12 2-grams of 'Live and let live': |
|||
[('LI', 2), ('IV', 2), ('VE', 2), (' L', 2), ('E ', 1), (' A', 1), ('AN', 1), |
|||
('ND', 1), ('D ', 1), ('LE', 1), ('ET', 1), ('T ', 1)] |
|||
13 3-grams of 'Live and let live': |
|||
[('LIV', 2), ('IVE', 2), ('VE ', 1), ('E A', 1), (' AN', 1), ('AND', 1), |
|||
('ND ', 1), ('D L', 1), (' LE', 1), ('LET', 1), ('ET ', 1), ('T L', 1), |
|||
(' LI', 1)] |
|||
13 4-grams of 'Live and let live': |
|||
[('LIVE', 2), ('IVE ', 1), ('VE A', 1), ('E AN', 1), (' AND', 1), ('AND ', 1), |
|||
('ND L', 1), ('D LE', 1), (' LET', 1), ('LET ', 1), ('ET L', 1), ('T LI', 1), |
|||
(' LIV', 1)] |
|||
</pre> |
|||
===Sliding window=== |
|||
⚫ | |||
<syntaxhighlight lang="python"> |
<syntaxhighlight lang="python"> |
||
import pprint |
|||
from collections import Counter |
from collections import Counter |
||
from collections import deque |
from collections import deque |
||
from itertools import islice |
from itertools import islice |
||
from typing import Iterable |
|||
def n_grams(text: str, n: int) -> Iterable[str]:
    """Generate contiguous sequences of _n_ characters from _text_.

    Sliding-window variant: a bounded deque holds the current window, so
    the upper-cased text is consumed one character at a time through an
    iterator instead of being sliced by index.

    Raises:
        ValueError: if _n_ is less than 1.
    """
    # Validate at call time. Without this check, n == 0 would build a
    # zero-length window and yield len(text) + 1 empty strings.
    if n < 1:
        raise ValueError("n must be an integer > 0")

    def windows() -> Iterable[str]:
        it = iter(text.upper())
        # Prime the window with the first n characters; maxlen=n makes each
        # later append() drop the oldest character automatically.
        n_gram = deque(islice(it, n), maxlen=n)
        # A text shorter than n never fills the window, so nothing is yielded.
        if len(n_gram) == n:
            yield "".join(n_gram)
        for x in it:
            n_gram.append(x)
            yield "".join(n_gram)

    return windows()
||
def main() -> None:
    """Print the n-gram frequency tables of a sample sentence for n = 2, 3, 4."""
    example_text = "Live and let live"

    for n in range(2, 5):
        # most_common() with no argument returns every (n-gram, count) pair,
        # ordered from most to least frequent.
        counts = Counter(n_grams(example_text, n)).most_common()
        print(
            f"{len(counts)} {n}-grams of {example_text!r}:\n",
            pprint.pformat(counts, compact=True),
            end="\n\n",
        )


if __name__ == "__main__":
    main()
|||
</syntaxhighlight> |
</syntaxhighlight> |
||