Jump to content

N-grams: Difference between revisions

1,470 bytes added ,  2 months ago
→‎{{header|Python}}: Add simpler (and faster) implementation
(→‎{{header|Python}}: Add simpler (and faster) implementation)
Line 718:
=={{header|Python}}==
 
<syntaxhighlight lang="python">
# This example generates n-grams lazily, much like the sliding_window recipe from the Python itertools docs (https://docs.python.org/3/library/itertools.html#itertools-recipes).
import pprint
from collections import Counter
from typing import Iterable
 
 
def n_grams(text: str, n: int) -> Iterable[str]:
    """Generate contiguous sequences of _n_ characters from _text_.

    The text is upper-cased first, so the n-grams are case-insensitive.
    N-grams are produced lazily, in left-to-right order. If _n_ is larger
    than the length of _text_, nothing is produced.

    Raises:
        ValueError: if _n_ is less than 1. This function returns a
            generator expression rather than being a generator function
            itself, so the check runs at call time, not on first iteration.
    """
    if n < 1:
        raise ValueError("n must be an integer > 0")

    text = text.upper()
    # range() is empty when len(text) - n + 1 <= 0, so short inputs simply
    # yield no n-grams instead of yielding truncated slices.
    return (text[i : i + n] for i in range(len(text) - n + 1))
 
 
def main() -> None:
    """Print n-gram frequency tables (n = 2, 3 and 4) for a sample sentence."""
    example_text = "Live and let live"

    for n in range(2, 5):
        # most_common() with no argument returns every (n-gram, count) pair,
        # ordered from the most frequent to the least frequent.
        counts = Counter(n_grams(example_text, n)).most_common()
        print(
            f"{len(counts)} {n}-grams of {example_text!r}:\n",
            pprint.pformat(counts, compact=True),
            end="\n\n",
        )
 
 
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
</syntaxhighlight>
 
{{out}}
<pre>
12 2-grams of 'Live and let live':
[('LI', 2), ('IV', 2), ('VE', 2), (' L', 2), ('E ', 1), (' A', 1), ('AN', 1),
('ND', 1), ('D ', 1), ('LE', 1), ('ET', 1), ('T ', 1)]
 
13 3-grams of 'Live and let live':
[('LIV', 2), ('IVE', 2), ('VE ', 1), ('E A', 1), (' AN', 1), ('AND', 1),
('ND ', 1), ('D L', 1), (' LE', 1), ('LET', 1), ('ET ', 1), ('T L', 1),
(' LI', 1)]
 
13 4-grams of 'Live and let live':
[('LIVE', 2), ('IVE ', 1), ('VE A', 1), ('E AN', 1), (' AND', 1), ('AND ', 1),
('ND L', 1), ('D LE', 1), (' LET', 1), ('LET ', 1), ('ET L', 1), ('T LI', 1),
(' LIV', 1)]
</pre>
 
 
===Sliding window===
 
This example takes inspiration from the [https://docs.python.org/3/library/itertools.html#itertools-recipes sliding_window recipe] found in Python's itertools docs.
 
<syntaxhighlight lang="python">
import pprint
from collections import Counter
from collections import deque
from itertools import islice
from typing import Iterable
 
 
def n_grams(text: str, n: int) -> Iterable[str]:
    """Generate contiguous sequences of _n_ characters from _text_.

    The text is upper-cased first, so the n-grams are case-insensitive.
    N-grams are produced lazily by sliding a fixed-size window over the
    characters. If _text_ has fewer than _n_ characters, nothing is
    produced.

    Raises:
        ValueError: if _n_ is less than 1. The check runs at call time
            because the lazy part lives in an inner generator.
    """
    if n < 1:
        raise ValueError("n must be an integer > 0")

    def windows() -> Iterable[str]:
        chars = iter(text.upper())
        # Pre-fill the window with the first n characters. maxlen=n makes
        # each later append() drop the oldest character automatically.
        n_gram = deque(islice(chars, n), maxlen=n)
        # A short input leaves the window under-filled: emit nothing.
        if len(n_gram) == n:
            yield "".join(n_gram)
        for ch in chars:
            n_gram.append(ch)
            yield "".join(n_gram)

    return windows()
 
 
def main() -> None:
    """Print n-gram frequency tables (n = 2, 3 and 4) for a sample sentence."""
    example_text = "Live and let live"

    for n in range(2, 5):
        # most_common() with no argument returns every (n-gram, count) pair,
        # ordered from the most frequent to the least frequent.
        counts = Counter(n_grams(example_text, n)).most_common()
        print(
            f"{len(counts)} {n}-grams of {example_text!r}:\n",
            pprint.pformat(counts, compact=True),
            end="\n\n",
        )
 
 
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
</syntaxhighlight>
 
144

edits

Cookies help us deliver our services. By using our services, you agree to our use of cookies.