Source code for meerqat.data.wikidump
# coding: utf-8
"""
**input/output**: ``entities.json``
Parses the dump (should be downloaded first, TODO add instructions), gathers images and assign them to the relevant entity given its common categories (retrieved in ``wiki.py commons rest``)
Note that the wikicode is parsed very lazily and might need a second run depending on your application, e.g. templates are not expanded...
Usage: wikidump.py <subset>
"""
import bz2
import xml.etree.ElementTree as ET
from tqdm import tqdm
from docopt import docopt
import json
import re
import pandas as pd
from .loading import DATA_ROOT_PATH
from .wiki import VALID_ENCODING
NAMESPACE = {"mw": "http://www.mediawiki.org/xml/export-0.10/"}
def parse_file(path):
    """Parse a dump file (plain XML or bz2-compressed) and return its ElementTree."""
if path.suffix == ".bz2":
with bz2.open(path, "rb") as file:
tree = ET.parse(file)
else:
tree = ET.parse(path)
return tree
def find(element, tag, namespace=NAMESPACE):
    """Wrapper around ``ET.Element.find`` that returns None if ``element`` is None."""
if element is None:
return None
return element.find(tag, namespace)
def find_text(element, tag, namespace=NAMESPACE):
    """Like ``find`` but returns ``result.text``, or None if the element was not found."""
result = find(element, tag, namespace)
if result is None:
return None
return result.text
def get_field(wikitext, image, field):
    """Extract ``field`` from a ``field=value`` line of the wikitext and store it under ``image[field.lower()]``."""
result = re.findall(rf"{field}=\s*(.+)\n", wikitext)
if result:
image[field.lower()] = result[0]
return result
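# ``entities`` is expected to map entity identifiers to dicts holding at least ``"n_questions"``
# and ``"categories"`` (as gathered by ``wiki.py``); ``process_article`` adds an ``"images"`` dict
# to every matching entity. Illustrative shape (hypothetical entity and values):
# {
#     "Q42": {
#         "n_questions": 3,
#         "categories": {"Category:Douglas Adams": {}},
#         "images": {"File:Some photo.jpg": {"categories": [...], "timestamp": ..., ...}},
#     }
# }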
def process_article(article, entities, entity_categories):
    """Iterate over the pages of one dump file and attach every matching image to the entities that share one of its categories."""
for page in article:
title = find_text(page, "mw:title")
        # keep only "File:" pages whose extension is a valid image encoding (cf. VALID_ENCODING)
if title is None or not title.startswith("File:") or title.split('.')[-1].lower() not in VALID_ENCODING:
continue
revision = find(page, "mw:revision")
if revision is None:
continue
wikitext = find_text(revision, "mw:text")
if wikitext is None:
continue
# find categories
categories = set()
        for internal_link in re.findall(r"\[\[(.+?)\]\]", wikitext):
            if internal_link.lower().startswith("category:"):
                # strip the displayed name from the link, e.g. "[[category:Foo|Bar]]" -> "category:Foo"
                pipe = internal_link.find("|")
                if pipe >= 0:
                    internal_link = internal_link[: pipe]
                # normalize the "category" prefix to sentence case
                categories.add("C" + internal_link[1:])
        # is there any entity with these categories?
        # (this also filters out pages for which no category was found in the wikitext)
if not (categories & entity_categories):
continue
image = {"categories": list(categories),
"timestamp": find_text(revision, "mw:timestamp")}
contributor = find(revision, "mw:contributor")
image["username"] = find_text(contributor, "mw:username")
for field in ["Date", "Author"]:
get_field(wikitext, image, field)
        # the description runs until the next template field (i.e. the next line starting with "|")
        description = re.search(r"description\s*=\s*(.+)", wikitext, flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)
if description is not None:
description = description.group(1)
i_new_field = description.find("\n|")
if i_new_field >= 0:
description = description[:i_new_field]
image["description"] = description
        # the license template(s) usually follow a "{{int:license-header}}" section heading
        for license_match in re.finditer(r"{{int:license-header}}\s*=+", wikitext):
license_ = re.findall("{{.+}}", wikitext[license_match.end():])
if license_:
image["license"] = license_[0]
break
# find entities with appropriate categories and save the image
for entity in entities.values():
if entity["n_questions"] < 1:
continue
if entity.get("categories", {}).keys() & categories:
entity.setdefault("images", {})
entity["images"][title] = image
return entities
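# The dump is expected to be split into numbered ``pages-articles`` files downloaded from
# https://dumps.wikimedia.org/commonswiki/latest/ and stored directly under ``dump_path``,
# e.g. ``commonswiki-latest-pages-articles1.xml-p1p1500000.bz2`` (illustrative page range).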
def process_articles(dump_path, entities):
    """Run ``process_article`` on every dump file found in ``dump_path`` and return the updated entities."""
# set of all categories to enable faster search
categories = {category for entity in entities.values() if entity["n_questions"] > 0
for category in entity.get("categories", {})}
articles_path = list(dump_path.glob(r"commonswiki-latest-pages-articles[0-9]*"))
for article_path in tqdm(articles_path, desc="Processing articles"):
article = parse_file(article_path).getroot()
process_article(article, entities, categories)
return entities
if __name__ == "__main__":
# parse arguments
args = docopt(__doc__)
subset = args['<subset>']
# load entities
subset_path = DATA_ROOT_PATH / f"meerqat_{subset}"
path = subset_path / "entities.json"
with open(path, 'r') as file:
entities = json.load(file)
dump_path = DATA_ROOT_PATH / "commonswiki"
process_articles(dump_path, entities)
# save output
with open(path, 'w') as file:
json.dump(entities, file)
print(f"Successfully saved output to {path}")
    n_images = [len(entity.get('images', {})) for entity in entities.values()]
    print(f"Gathered images for {len(entities)} entities (distribution of the per-entity image count):\n{pd.DataFrame(n_images).describe()}")