N-grams: Difference between revisions

Content added Content deleted
(→‎{{header|Python}}: Add simpler (and faster) implementation)
Line 718: Line 718:
=={{header|Python}}==
=={{header|Python}}==


This example generates n-grams lazily, much like the [https://docs.python.org/3/library/itertools.html#itertools-recipes sliding_window recipe] from the Python itertools docs.

<syntaxhighlight lang="python">
import pprint
from collections import Counter
from typing import Iterable


def n_grams(text: str, n: int) -> Iterable[str]:
    """Generate contiguous sequences of _n_ characters from _text_.

    _text_ is upper-cased first, so the n-grams are case-insensitive.
    The result is a lazy generator; if _n_ is larger than the upper-cased
    text, it yields nothing.

    Raises:
        ValueError: if _n_ is less than 1. The check runs when the function
            is called, not when the result is first iterated.
    """
    if n < 1:
        raise ValueError("n must be an integer > 0")

    text = text.upper()
    # There are len(text) - n + 1 start positions; a non-positive count
    # gives an empty range and therefore no n-grams.
    return (text[i : (i + n)] for i in range(len(text) - n + 1))


def main() -> None:
    """Print frequency tables of the 2-, 3- and 4-grams of a sample phrase."""
    example_text = "Live and let live"

    for n in range(2, 5):
        # most_common() without an argument returns every (n-gram, count)
        # pair, ordered from the most frequent to the least frequent.
        counts = Counter(n_grams(example_text, n)).most_common()
        print(
            f"{len(counts)} {n}-grams of {example_text!r}:\n",
            pprint.pformat(counts, compact=True),
            end="\n\n",
        )


# Run the demonstration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
</syntaxhighlight>

{{out}}
<pre>
12 2-grams of 'Live and let live':
[('LI', 2), ('IV', 2), ('VE', 2), (' L', 2), ('E ', 1), (' A', 1), ('AN', 1),
('ND', 1), ('D ', 1), ('LE', 1), ('ET', 1), ('T ', 1)]

13 3-grams of 'Live and let live':
[('LIV', 2), ('IVE', 2), ('VE ', 1), ('E A', 1), (' AN', 1), ('AND', 1),
('ND ', 1), ('D L', 1), (' LE', 1), ('LET', 1), ('ET ', 1), ('T L', 1),
(' LI', 1)]

13 4-grams of 'Live and let live':
[('LIVE', 2), ('IVE ', 1), ('VE A', 1), ('E AN', 1), (' AND', 1), ('AND ', 1),
('ND L', 1), ('D LE', 1), (' LET', 1), ('LET ', 1), ('ET L', 1), ('T LI', 1),
(' LIV', 1)]
</pre>


===Sliding window===

This example takes inspiration from the [https://docs.python.org/3/library/itertools.html#itertools-recipes sliding_window recipe] found in Python's itertools docs.


<syntaxhighlight lang="python">
import pprint
from collections import Counter
from collections import Counter
from collections import deque
from collections import deque
from itertools import islice
from itertools import islice
from typing import Iterable




def n_grams(text: str, n: int) -> Iterable[str]:
    """Generate contiguous sequences of _n_ characters from _text_.

    _text_ is upper-cased first, so the n-grams are case-insensitive.
    The result is a lazy generator; if the upper-cased text has fewer than
    _n_ characters, it yields nothing.

    Raises:
        ValueError: if _n_ is less than 1. The check runs when the function
            is called, not when the result is first iterated.
    """
    # Without this guard n == 0 would yield a stream of empty strings and a
    # negative n would only fail later, inside islice/deque.
    if n < 1:
        raise ValueError("n must be an integer > 0")
    return _sliding_windows(text.upper(), n)


def _sliding_windows(chars: Iterable[str], n: int) -> Iterable[str]:
    """Yield every run of _n_ consecutive items of _chars_, joined as a string."""
    it = iter(chars)
    # The bounded deque is the current window: once it is full, appending a
    # new item drops the oldest one from the other end.
    n_gram = deque(islice(it, n), maxlen=n)
    if len(n_gram) == n:
        yield "".join(n_gram)
    for x in it:
        n_gram.append(x)
        yield "".join(n_gram)




def main() -> None:
    """Print frequency tables of the 2-, 3- and 4-grams of a sample phrase."""
    example_text = "Live and let live"

    for n in range(2, 5):
        # most_common() without an argument returns every (n-gram, count)
        # pair, ordered from the most frequent to the least frequent.
        counts = Counter(n_grams(example_text, n)).most_common()
        print(
            f"{len(counts)} {n}-grams of {example_text!r}:\n",
            pprint.pformat(counts, compact=True),
            end="\n\n",
        )


# Run the demonstration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
</syntaxhighlight>