Rosetta Code/Find bare lang tags: Difference between revisions

4DOS Batch 1 (100_doors)
Total: 1094
</pre>

=={{header|Python}}==
<lang python>
"""Count bare `lang` tags in wiki markup. Requires Python >=3.6.

Uses the Python standard library `urllib` to make MediaWiki API requests.
"""

from __future__ import annotations

import functools
import gzip
import json
import logging
import platform
import re

from collections import Counter
from collections import defaultdict

from typing import Any
from typing import Iterator
from typing import Iterable
from typing import List
from typing import Mapping
from typing import NamedTuple
from typing import Optional
from typing import Tuple

from urllib.parse import urlencode
from urllib.parse import urlunparse
from urllib.parse import quote_plus

import urllib.error
import urllib.request

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


# Parse wiki markup with these regular expression patterns. Any headings and
# `lang` tags found inside `nowiki`, `pre` or other `lang` tags (bare or not)
# should not count as "bare".
#
# NOTE: The order of these patterns is significant.
RE_SPEC = [
("NOWIKI", r"<\s*nowiki\s*>.*?</\s*nowiki\s*>"),
("PRE", r"<\s*pre\s*>.*?</\s*pre\s*>"),
("LANG", r"<\s*lang\s+.+?>.*?</\s*lang\s*>"),
("HEAD", r"==\{\{\s*header\s*\|\s*(?P<header>.+?)\s*}}=="),
("BARE", r"<\s*lang\s*>.*?</\s*lang\s*>"),
]
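
# Because the scan proceeds left to right, a bare tag that sits inside a
# nowiki or pre block is consumed by the earlier-starting NOWIKI/PRE match
# and is never reported as BARE.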

RE_BARE_LANG = re.compile(
"|".join(rf"(?P<{name}>{pattern})" for name, pattern in RE_SPEC),
re.DOTALL | re.IGNORECASE,
)

# Some wiki headings look like this "=={{header|Some}} / {{header|Other}}==".
# We'll use this regular expression to strip out the markup.
RE_MULTI_HEADER = re.compile(r"(}|(\{\{\s*header\s*\|\s*))", re.IGNORECASE)
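
# For example, the `header` group captured from
# "=={{header|Some}} / {{header|Other}}==" is "Some}} / {{header|Other";
# the substitution strips the markup, leaving "Some / Other".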


def find_bare_lang_section_headers(wiki_text: str) -> Iterator[str]:
"""Generate a sequence of wiki section headings that contain bare
'lang' tags.

If there are multiple bare lang tags in a section, that section
heading will appear multiple times in the sequence.
"""
current_heading = "no language"

for match in RE_BARE_LANG.finditer(wiki_text):
kind = match.lastgroup

if kind == "HEAD":
current_heading = RE_MULTI_HEADER.sub("", match.group("header"))
elif kind == "BARE":
yield current_heading

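# A quick illustrative check; the closing tag is split in two so that this
# comment does not terminate the surrounding wiki code block:
#
#   >>> text = "=={{header|C}}==\n<lang>x</l" + "ang>"
#   >>> list(find_bare_lang_section_headers(text))
#   ['C']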

class Error(Exception):
"""Exception raised when we get an unexpected response from the MediaWiki API."""


class TagCounter:
"""Count bare `lang` tags in wiki markup. Group them by heading and
remember what page they're in."""

    def __init__(self):
        self.counter = Counter()
        self.pages = defaultdict(set)
        self.total = 0

    def __len__(self):
        return len(self.counter)

    @classmethod
    def from_section_headers(
        cls, page_title: str, section_headers: Iterable[str]
    ) -> TagCounter:
        """Return a new `TagCounter` initialized with the given section
        headings."""
        counter = cls()

        for heading in section_headers:
            counter.add(page_title, heading)

        return counter

    @classmethod
    def from_wiki_text(cls, page_title: str, wiki_text: str) -> TagCounter:
        """Return a new `TagCounter` initialized with bare lang tags from the
        given wiki text."""
        return cls.from_section_headers(
            page_title,
            find_bare_lang_section_headers(wiki_text),
        )

    def add(self, page_title: str, section_heading: str):
        """Increment the counter by one for the given section heading and
        page."""
        self.counter[section_heading] += 1
        self.pages[section_heading].add(page_title)
        self.total += 1

    def update(self, other):
        """Union this counter with `other`, another counter."""
        assert isinstance(other, TagCounter)

        self.counter.update(other.counter)

        for section_heading, pages in other.pages.items():
            self.pages[section_heading].update(pages)

        self.total += other.total

    def most_common(self, n=None) -> str:
        """Return a formatted string of the most common wiki sections to have
        bare lang tags."""
        buf = [f"{sum(self.counter.values())} bare lang tags.\n"]

        for section_heading, count in self.counter.most_common(n=n):
            pages = list(self.pages[section_heading])
            buf.append(f"{count} in {section_heading} {pages}")

        return "\n".join(buf)


def quote_underscore(string, safe="", encoding=None, errors=None):
"""Like urllib.parse.quote but replaces spaces with underscores."""
string = quote_plus(string, safe, encoding, errors)
return string.replace("+", "_")
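
# For example, quote_underscore("100 doors") returns "100_doors", which
# matches MediaWiki's underscore convention for page titles in URLs.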


class URL(NamedTuple):
"""A `urllib.parse.urlunparse` compatible Tuple with some helper methods.
We'll use this to build and pass around our MediaWiki API URLs.
"""

scheme: str
netloc: str
path: str
params: str
query: str
fragment: str

def __str__(self):
return urlunparse(self)

def with_query(self, query: Mapping[str, Any]) -> URL:
query_string = urlencode(query, safe=":", quote_via=quote_underscore)
return self._replace(query=query_string)


API_BASE_URL = URL(
scheme="http",
netloc="rosettacode.org",
path="/mw/api.php",
params="",
query="",
fragment="",
)
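
# Illustrative usage:
#   str(API_BASE_URL.with_query({"action": "query"}))
#   -> "http://rosettacode.org/mw/api.php?action=query"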

UGLY_RAW_URL = URL(
scheme="http",
netloc="rosettacode.org",
path="/mw/index.php",
params="",
query="",
fragment="",
)

# NOTE: Cloudflare was blocking requests with the default user agent.
DEFAULT_HEADERS = {
"User-agent": f"python/{platform.python_version()}",
"Accept-encoding": "gzip, deflate",
"Accept": "*/*",
"Connection": "keep-alive",
}
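
# NOTE: urllib does not decompress response bodies automatically, so the
# code below checks Content-Encoding and calls gzip.decompress by hand.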


class Response(NamedTuple):
    headers: Mapping[str, str]
    body: bytes


def get(url: URL, headers=DEFAULT_HEADERS) -> Response:
"""Make an HTTP GET request to the given URL."""
logger.debug(f"GET {url}")
request = urllib.request.Request(str(url), headers=headers)

try:
with urllib.request.urlopen(request) as response:
return Response(
headers=dict(response.getheaders()),
body=response.read(),
)
except urllib.error.HTTPError as e:
logging.debug(e.code)
logging.debug(gzip.decompress(e.read()))
raise


def raise_for_header(headers: Mapping[str, str], header: str, expect: str):
    got = headers.get(header)
    if got != expect:
        raise Error(f"expected '{expect}', got '{got}'")


raise_for_content_type = functools.partial(raise_for_header, header="Content-Type")


class CMContinue(NamedTuple):
    continue_: str
    cmcontinue: str


Pages = Tuple[List[str], Optional[CMContinue]]
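
# When more results are available, the MediaWiki API includes a "continue"
# object in the JSON response (e.g. {"continue": "-||", "cmcontinue": ...});
# echoing those values back in the next request fetches the following chunk.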


def get_wiki_page_titles(
    chunk_size: int = 500, continue_: Optional[CMContinue] = None
) -> Pages:
"""Return a list of wiki page titles and any continuation information."""
query = {
"action": "query",
"list": "categorymembers",
"cmtitle": "Category:Programming_Tasks",
"cmlimit": chunk_size,
"format": "json",
"continue": "",
}

if continue_:
query["continue"] = continue_.continue_
query["cmcontinue"] = continue_.cmcontinue

response = get(API_BASE_URL.with_query(query))

# Fail early if the response is not what we are expecting.
raise_for_content_type(response.headers, expect="application/json; charset=utf-8")
raise_for_header(response.headers, "Content-Encoding", "gzip")

data = json.loads(gzip.decompress(response.body))
page_titles = [p["title"] for p in data["query"]["categorymembers"]]

if data.get("continue", {}).get("cmcontinue"):
_continue = CMContinue(
data["continue"]["continue"],
data["continue"]["cmcontinue"],
)
else:
_continue = None

return (page_titles, _continue)


def get_wiki_page_markup(page_title: str) -> str:
"""Return raw MediaWiki markup from the page `page_title`."""
query = {"action": "raw", "title": page_title}
response = get(UGLY_RAW_URL.with_query(query))

# Fail early if the response is not what we are expecting.
raise_for_content_type(response.headers, expect="text/x-wiki; charset=UTF-8")

return response.body.decode()


def example(limit=30):
    # Get the first chunk of wiki page titles from the MediaWiki API.
    page_titles, continue_ = get_wiki_page_titles()

    # Get more chunks if there are any.
    while continue_ is not None:
        more_page_titles, continue_ = get_wiki_page_titles(continue_=continue_)
        page_titles.extend(more_page_titles)

    # Aggregate counts from all pages.
    counter = TagCounter()

    for i, page_title in enumerate(page_titles):
        if i >= limit:
            break

        # Read and parse raw wiki page markup.
        wiki_text = get_wiki_page_markup(page_title)
        counts = TagCounter.from_wiki_text(page_title, wiki_text)
        counter.update(counts)

    # Dump the results to stdout.
    print(counter.most_common())


if __name__ == "__main__":
    logging.basicConfig(format="%(asctime)s %(message)s", level=logging.DEBUG)
    example()
</lang>

{{out}}
Limited to the first 30 wiki pages.
<pre>
44 bare lang tags.

5 in EasyLang ['15 Puzzle Game', '100 doors', 'A+B', 'Ackermann function', '21 Game']
4 in Scilab ['15 Puzzle Game', '100 doors', 'Ackermann function', 'AKS test for primes']
4 in uBasic/4tH ['AKS test for primes', '100 doors', 'Abundant, deficient and perfect number classifications', '99 Bottles of Beer']
3 in Ursa ['100 doors', 'A+B', '99 Bottles of Beer']
2 in Caché ObjectScript ['100 doors']
2 in Klingphix ['Ackermann function', '99 Bottles of Beer']
2 in M2000 Interpreter ['A+B', 'Abstract type']
2 in PicoLisp ['AKS test for primes', 'ABC Problem']
2 in ERRE ['Address of a variable']
1 in 4DOS Batch ['100 doors']
1 in PostScript ['100 doors']
1 in Factor ['2048']
1 in R ['21 Game']
1 in OCaml ['99 Bottles of Beer']
1 in Excel ['A+B']
1 in Java ['A+B']
1 in Maxima ['A+B']
1 in Mercury ['A+B']
1 in J ['Abbreviations, automatic']
1 in Python ['Abelian sandpile model']
1 in GFA Basic ['Abundant, deficient and perfect number classifications']
1 in МК-61/52 ['Ackermann function']
1 in Nim ['Active object']
1 in Go ['Address of a variable']
1 in Smalltalk ['Address of a variable']
1 in COBOL ['Align columns']
1 in CoffeeScript ['Align columns']
</pre>