N-grams: Difference between revisions

Content added Content deleted
(→‎{{header|Python}}: Add simpler (and faster) implementation)
Line 718:
=={{header|Python}}==
 
<syntaxhighlight lang="python">
# This example generates n-grams lazily, much like the sliding_window recipe
# from the Python itertools docs:
# https://docs.python.org/3/library/itertools.html#itertools-recipes
import pprint
from collections import Counter
from typing import Iterable
 
 
def n_grams(text: str, n: int) -> Iterable[str]:
    """Generate contiguous sequences of _n_ characters from _text_.

    The text is upper-cased first, so n-grams are compared
    case-insensitively. The n-grams are produced lazily, in order of
    their position in the text.

    Args:
        text: The string to split into n-grams.
        n: The length of each n-gram; must be at least 1.

    Returns:
        A lazy iterable of upper-cased substrings of length ``n``. It is
        empty when ``n`` exceeds the length of the upper-cased text.

    Raises:
        ValueError: If ``n`` is less than 1. Raised immediately on call,
            not on first iteration, because this function is not itself
            a generator.
    """
    if n < 1:
        raise ValueError("n must be an integer > 0")

    text = text.upper()
    # range() is empty when len(text) < n, so short inputs yield nothing.
    return (text[i : (i + n)] for i in range(len(text) - n + 1))
 
 
def main() -> None:
    """Print the 2-, 3- and 4-gram frequency tables for a sample sentence."""
    example_text = "Live and let live"

    for n in range(2, 5):
        # most_common() with no argument returns every (n-gram, count)
        # pair, ordered from the most frequent to the least.
        counts = Counter(n_grams(example_text, n)).most_common()
        print(
            f"{len(counts)} {n}-grams of {example_text!r}:\n",
            pprint.pformat(counts, compact=True),
            end="\n\n",
        )


if __name__ == "__main__":
    main()
</syntaxhighlight>
 
{{out}}
<pre>
12 2-grams of 'Live and let live':
[('LI', 2), ('IV', 2), ('VE', 2), (' L', 2), ('E ', 1), (' A', 1), ('AN', 1),
('ND', 1), ('D ', 1), ('LE', 1), ('ET', 1), ('T ', 1)]
 
13 3-grams of 'Live and let live':
[('LIV', 2), ('IVE', 2), ('VE ', 1), ('E A', 1), (' AN', 1), ('AND', 1),
('ND ', 1), ('D L', 1), (' LE', 1), ('LET', 1), ('ET ', 1), ('T L', 1),
(' LI', 1)]
 
13 4-grams of 'Live and let live':
[('LIVE', 2), ('IVE ', 1), ('VE A', 1), ('E AN', 1), (' AND', 1), ('AND ', 1),
('ND L', 1), ('D LE', 1), (' LET', 1), ('LET ', 1), ('ET L', 1), ('T LI', 1),
(' LIV', 1)]
</pre>
 
 
===Sliding window===
 
This example takes inspiration from the [https://docs.python.org/3/library/itertools.html#itertools-recipes sliding_window recipe] found in Python's itertools docs.
 
<syntaxhighlight lang="python">
import pprint
from collections import Counter
from collections import deque
from itertools import islice
from typing import Iterable
 
 
def n_grams(text: str, n: int) -> Iterable[str]:
    """Generate contiguous sequences of _n_ characters from _text_.

    The text is upper-cased first, so n-grams are compared
    case-insensitively. A bounded deque serves as the sliding window:
    appending a character to a full deque drops the oldest one.

    Args:
        text: The string to split into n-grams.
        n: The length of each n-gram; must be at least 1.

    Yields:
        Upper-cased substrings of length ``n``, in order of their position
        in the text. Nothing is yielded when the upper-cased text is
        shorter than ``n``.

    Raises:
        ValueError: If ``n`` is less than 1. Because this is a generator
            function, the error surfaces on the first iteration rather
            than at call time.
    """
    if n < 1:
        # Without this guard n == 0 would yield a stream of empty strings.
        raise ValueError("n must be an integer > 0")

    it = iter(text.upper())
    # Pre-fill the window with the first n characters (fewer if text is short).
    n_gram = deque(islice(it, n), maxlen=n)
    if len(n_gram) == n:
        yield "".join(n_gram)
    for ch in it:
        # maxlen=n makes append() evict the leftmost character automatically.
        n_gram.append(ch)
        yield "".join(n_gram)
 
 
def main() -> None:
    """Print the 2-, 3- and 4-gram frequency tables for a sample sentence."""
    example_text = "Live and let live"

    for n in range(2, 5):
        # most_common() with no argument returns every (n-gram, count)
        # pair, ordered from the most frequent to the least.
        counts = Counter(n_grams(example_text, n)).most_common()
        print(
            f"{len(counts)} {n}-grams of {example_text!r}:\n",
            pprint.pformat(counts, compact=True),
            end="\n\n",
        )


if __name__ == "__main__":
    main()
</syntaxhighlight>