Source code for meerqat.data.wikidump

# coding: utf-8
"""
**input/output**: ``entities.json``  
Parses the dump (should be downloaded first, TODO add instructions), gathers images and assigns them to the relevant entity given its Commons categories (retrieved in ``wiki.py commons rest``)  
Note that the wikicode is parsed only superficially and might need a second pass depending on your application, e.g. templates are not expanded...

Usage: wikidump.py <subset>
"""
import bz2
import xml.etree.ElementTree as ET
from tqdm import tqdm
from docopt import docopt
import json
import re
import pandas as pd

from .loading import DATA_ROOT_PATH
from .wiki import VALID_ENCODING


NAMESPACE = {"mw": "http://www.mediawiki.org/xml/export-0.10/"}


def parse_file(path):
    """Parse an XML dump file, decompressing it on the fly if it is bz2-compressed."""
    if path.suffix == ".bz2":
        with bz2.open(path, "rb") as file:
            tree = ET.parse(file)
    else:
        tree = ET.parse(path)
    return tree


def find(element, tag, namespace=NAMESPACE):
    """Like ``ET.Element.find`` but returns None if ``element`` itself is None."""
    if element is None:
        return None
    return element.find(tag, namespace)


def find_text(element, tag, namespace=NAMESPACE):
    """returns result.text if result is not None"""
    result = find(element, tag, namespace)
    if result is None:
        return None
    return result.text


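# For illustration only: the MediaWiki export XML that these helpers navigate looks
# roughly like this (only the elements used below are shown; the namespace matches
# NAMESPACE above):
#
#   <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/">
#     <page>
#       <title>File:Example.jpg</title>
#       <revision>
#         <timestamp>...</timestamp>
#         <contributor><username>...</username></contributor>
#         <text>...wikitext...</text>
#       </revision>
#     </page>
#   </mediawiki>
#
# e.g. find_text(page, "mw:title") returns the title string, or None when either the
# page or its <title> child is missing.

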
def get_field(wikitext, image, field):
    """Extract ``field`` from the wikitext and store it in ``image`` under a lower-cased key."""
    result = re.findall(rf"{field}=\s*(.+)\n", wikitext)
    if result:
        image[field.lower()] = result[0]
    return result


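# Hypothetical example of get_field on a Commons-style {{Information}} block:
#
#   wikitext = "{{Information\n|Author=Jane Doe\n|Date=2008-05-17\n}}"
#   image = {}
#   get_field(wikitext, image, "Author")  # -> ["Jane Doe"]
#   # image is now {"author": "Jane Doe"}; note the lower-cased key.

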
def process_article(article, entities, entity_categories):
    """Gather images from one dump shard and attach them to entities with matching categories."""
    for page in article:
        title = find_text(page, "mw:title")
        # keep only files with valid encoding
        if title is None or not title.startswith("File:") or title.split('.')[-1].lower() not in VALID_ENCODING:
            continue
        revision = find(page, "mw:revision")
        if revision is None:
            continue
        wikitext = find_text(revision, "mw:text")
        if wikitext is None:
            continue

        # find categories
        categories = set()
        for internal_link in re.findall(r"\[\[(.+)\]\]", wikitext):
            if internal_link.lower().startswith("category:"):
                # remove name from link
                name = internal_link.find("|")
                if name >= 0:
                    internal_link = internal_link[: name]
                # make "Category" sentence-cased
                categories.add("C"+internal_link[1: ])
        # is there any entity with these categories?
        # note this also filters in case we did not find any category in wikitext
        if not (categories & entity_categories):
            continue

        # gather image metadata from the revision and its wikitext
        image = {"categories": list(categories), "timestamp": find_text(revision, "mw:timestamp")}
        contributor = find(revision, "mw:contributor")
        image["username"] = find_text(contributor, "mw:username")
        for field in ["Date", "Author"]:
            get_field(wikitext, image, field)
        description = re.search(r"description\s*=\s*(.+)", wikitext, flags=re.IGNORECASE|re.DOTALL|re.MULTILINE)
        if description is not None:
            description = description.group(1)
            # keep only the description field, not the rest of the template
            i_new_field = description.find("\n|")
            if i_new_field >= 0:
                description = description[:i_new_field]
            image["description"] = description
        for license_match in re.finditer(r"{{int:license-header}}\s*=+", wikitext):
            license_ = re.findall("{{.+}}", wikitext[license_match.end():])
            if license_:
                image["license"] = license_[0]
                break

        # find entities with appropriate categories and save the image
        for entity in entities.values():
            if entity["n_questions"] < 1:
                continue
            if entity.get("categories", {}).keys() & categories:
                entity.setdefault("images", {})
                entity["images"][title] = image
    return entities


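# For reference, an image record attached to an entity ends up shaped roughly like
# this (placeholder values; keys such as date/author/description/license appear only
# when found in the wikitext):
#
#   entity["images"]["File:Example.jpg"] = {
#       "categories": ["Category:..."],   # sentence-cased, as built above
#       "timestamp": "...",
#       "username": "...",
#       "date": "...",                    # via get_field("Date")
#       "author": "...",                  # via get_field("Author")
#       "description": "...",
#       "license": "{{...}}",
#   }

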
def process_articles(dump_path, entities):
    """Run :func:`process_article` on every dump shard found in ``dump_path``."""
    # set of all categories to enable faster search
    categories = {category
                  for entity in entities.values() if entity["n_questions"] > 0
                  for category in entity.get("categories", {})}
    articles_path = list(dump_path.glob(r"commonswiki-latest-pages-articles[0-9]*"))
    for article_path in tqdm(articles_path, desc="Processing articles"):
        article = parse_file(article_path).getroot()
        process_article(article, entities, categories)
    return entities


if __name__ == "__main__": # parse arguments args = docopt(__doc__) subset = args['<subset>'] # load entities subset_path = DATA_ROOT_PATH / f"meerqat_{subset}" path = subset_path / "entities.json" with open(path, 'r') as file: entities = json.load(file) dump_path = DATA_ROOT_PATH / "commonswiki" process_articles(dump_path, entities) # save output with open(path, 'w') as file: json.dump(entities, file) print(f"Successfully saved output to {path}") n_images = [len(entity.get('images', [])) for entity in entities.values()] print(f"Gathered images from {len(entities)} entities:\n{pd.DataFrame(n_images).describe()}")