# Source code for meerqat.viz.html
# -*- coding: utf-8 -*-
"""
usage: html.py [-h] [--config CONFIG] [--print_config [={comments,skip_null,skip_default}+]]
[--n N] [--width WIDTH] [--passages_path PASSAGES_PATH] [--wiki_path WIKI_PATH]
[--search_run SEARCH_RUN] [--other_search_run OTHER_SEARCH_RUN] dataset_path output
options:
-h, --help Show this help message and exit.
--config CONFIG Path to a configuration file.
--print_config [={comments,skip_null,skip_default}+]
Print the configuration after applying all other arguments and exit.
Visualize dataset in HTML:
dataset_path (required, type: str)
output (required, type: str)
--n N Number of examples to output after shuffling. Defaults to all without
shuffling. (type: Optional[int], default: null)
--width WIDTH Width of the image in HTML [default: 400]. (type: int, default: 400)
--passages_path PASSAGES_PATH
(type: Optional[str], default: null)
--wiki_path WIKI_PATH
(type: Optional[str], default: null)
--search_run SEARCH_RUN
(type: Optional[str], default: null)
--other_search_run OTHER_SEARCH_RUN
(type: Optional[str], default: null)
"""
import json
from tqdm import tqdm
from jsonargparse import CLI
from datasets import load_from_disk
from ranx import Run
# Skeleton of the generated page. ``format_html`` fills the ``{headers}`` and
# ``{rows}`` placeholders through ``str.format``. The page links a stylesheet
# by the relative path ``./styles.css``.
HTML_TEMPLATE = """<html>
<head>
<link rel="stylesheet" href="./styles.css">
</head>
<table>
{headers}
{rows}
</table>
</html>
"""
def get_top_1(item, run):
    """
    Return the identifier of the first result stored in ``run`` for ``item``.

    Parameters
    ----------
    item: dict
        Example whose ``'id'`` value is used as the query key.
    run: ranx.Run
        Search run; ``run.run`` is indexed by query id and yields a mapping
        whose keys are result identifiers.

    Returns
    -------
    int
        The first result identifier, converted to ``int``.

    Raises
    ------
    KeyError
        If ``item['id']`` is not a query of ``run``.
    ValueError
        If the run holds no result for this query.
    """
    results = run.run[item['id']]
    # NOTE(review): this takes the first key in iteration order, so it presumes
    # the results are stored best-first (ranx is expected to sort runs when
    # they are built -- confirm for the installed version).
    top1 = next(iter(results), None)
    if top1 is None:
        # A bare StopIteration would be uninformative and could silently end
        # an enclosing generator or ``map``; fail loudly instead.
        raise ValueError(f"no search result for query {item['id']!r}")
    return int(top1)
def get_url_and_text(i, wiki, passages):
    """
    Look up the image URL and the text to display for result ``i``.

    Parameters
    ----------
    i: int
        Index into ``passages`` when it is given, otherwise into ``wiki``.
    wiki:
        Indexable collection of articles with ``'url'`` and
        ``'wikipedia_title'`` fields.
    passages:
        Optional indexable collection of passages with ``'index'`` (position
        of the parent article in ``wiki``) and ``'passage'`` fields.

    Returns
    -------
    tuple
        ``(url, text)``: the article URL together with the passage text, or
        with the article title when ``passages`` is None.
    """
    if passages is None:
        # No passage collection: ``i`` addresses the article directly.
        entry = wiki[i]
        return entry['url'], entry['wikipedia_title']

    hit = passages[i]
    entry = wiki[hit['index']]
    return entry['url'], hit['passage']
def format_html(
    dataset_path: str,
    output: str,
    n: int = None,
    width: int = 400,
    passages_path: str = None,
    wiki_path: str = None,
    search_run: str = None,
    other_search_run: str = None
):
    """
    Visualize dataset in HTML

    Writes one HTML table to ``output`` with a row per example (image,
    question, answer). For each search run given, two more columns show the
    image and the text of that run's first result for the example.

    Parameters
    ----------
    dataset_path: str
        Path of the dataset, loaded with ``datasets.load_from_disk``.
    output: str
        Path of the HTML file to write (UTF-8).
    n: int
        Number of examples to output after shuffling.
        Defaults to all without shuffling.
    width: int
        Width of the image in HTML [default: 400].
    passages_path: str
        Path of the passages, loaded with ``datasets.load_from_disk``.
        When omitted, search results are looked up directly in the wiki.
    wiki_path: str
        Path of the wiki articles, loaded with ``datasets.load_from_disk``.
        Required when a search run is given.
    search_run: str
        Path of a search run, loaded with ``ranx.Run.from_file``.
    other_search_run: str
        Path of a second search run, shown in additional columns.

    Raises
    ------
    ValueError
        If a search run is given without ``wiki_path``.
    """
    # The stdlib ``html`` module is deliberately avoided: this file is itself
    # named ``html.py`` and can shadow it when executed as a script.
    from xml.sax.saxutils import escape as xml_escape

    def esc(value):
        """Escape ``value`` for HTML text or a double-quoted attribute."""
        return xml_escape(str(value), {'"': '&quot;'})

    # Results of a run are resolved through the wiki, so fail early (before
    # loading anything) instead of hitting an unbound ``wiki`` in the loop.
    if wiki_path is None and (search_run is not None or other_search_run is not None):
        raise ValueError(
            "wiki_path is required when search_run or other_search_run is given"
        )

    # complete template according to parameters
    if search_run is not None:
        search_run = Run.from_file(search_run)
        search_headers = "\n".join([
            "<th>Visual</th>",
            f"<th>Passage ({esc(search_run.name)})</th>",
        ])
        search_row_template = "\n".join([
            '<td><img src="{passage_url}" width="{width}"></td>',
            "<td>{passage_text}</td>",
        ])
    else:
        search_headers, search_row_template = "", ""

    if other_search_run is not None:
        other_search_run = Run.from_file(other_search_run)
        other_search_headers = "\n".join([
            "<th>Other Visual</th>",
            f"<th>Passage ({esc(other_search_run.name)})</th>",
        ])
        other_search_row_template = "\n".join([
            '<td><img src="{other_passage_url}" width="{width}"></td>',
            "<td>{other_passage_text}</td>",
        ])
    else:
        other_search_headers, other_search_row_template = "", ""

    headers = "\n".join([
        "<tr>",
        "<th>Visual</th>",
        "<th>Question</th>",
        "<th>Answer</th>",
        search_headers,
        other_search_headers,
        "</tr>",
    ])
    row_template = "\n".join([
        "<tr>",
        '<td><img src="{url}" width="{width}"></td>',
        "<td>{question}</td>",
        "<td>{answer}</td>",
        search_row_template,
        other_search_row_template,
        "</tr>",
    ])

    # load data
    dataset = load_from_disk(dataset_path)
    if n is not None:
        # Clamp so that asking for more examples than available keeps them all
        # instead of failing on an out-of-range index.
        dataset = dataset.shuffle().select(range(min(n, len(dataset))))
    passages = load_from_disk(passages_path) if passages_path is not None else None
    wiki = load_from_disk(wiki_path) if wiki_path is not None else None

    # do the actual formatting
    # Each run fills the placeholders carrying its own key prefix.
    runs = (('', search_run), ('other_', other_search_run))
    rows = []
    for item in tqdm(dataset):
        # Every value is escaped: dataset text may contain ``<``, ``&`` or
        # ``"`` which would otherwise corrupt the table markup.
        row = dict(
            url=esc(item['url']),
            width=esc(width),
            question=esc(item['input']),
            answer=esc(item['output']['original_answer'])
        )
        for prefix, run in runs:
            if run is None:
                continue
            i = get_top_1(item, run)
            passage_url, passage_text = get_url_and_text(i, wiki, passages)
            row[prefix + 'passage_url'] = esc(passage_url)
            row[prefix + 'passage_text'] = esc(passage_text)
        rows.append(row_template.format(**row))

    html_str = HTML_TEMPLATE.format(headers=headers, rows='\n'.join(rows))
    # Explicit encoding: the locale default cannot represent every question or
    # passage on all platforms.
    with open(output, 'wt', encoding='utf-8') as file:
        file.write(html_str)
# Script entry point: expose ``format_html`` as a command-line interface
# through jsonargparse's ``CLI``.
if __name__ == '__main__':
    CLI(format_html)