# Source code for meerqat.viz.html

# -*- coding: utf-8 -*-
"""
usage: html.py [-h] [--config CONFIG] [--print_config [={comments,skip_null,skip_default}+]] 
[--n N] [--width WIDTH] [--passages_path PASSAGES_PATH] [--wiki_path WIKI_PATH] 
[--search_run SEARCH_RUN] [--other_search_run OTHER_SEARCH_RUN] dataset_path output 

options:
  -h, --help            Show this help message and exit.
  --config CONFIG       Path to a configuration file.
  --print_config [={comments,skip_null,skip_default}+]
                        Print the configuration after applying all other arguments and exit.

Visualize dataset in HTML:
  dataset_path          (required, type: str)
  output                (required, type: str)
  --n N                 Number of examples to output after shuffling. Defaults to all without
                        shuffling. (type: Optional[int], default: null)
  --width WIDTH         Width of the image in HTML [default: 400]. (type: int, default: 400)
  --passages_path PASSAGES_PATH
                        (type: Optional[str], default: null)
  --wiki_path WIKI_PATH
                        (type: Optional[str], default: null)
  --search_run SEARCH_RUN
                        (type: Optional[str], default: null)
  --other_search_run OTHER_SEARCH_RUN
                        (type: Optional[str], default: null)
"""
import json
from tqdm import tqdm
from jsonargparse import CLI

from datasets import load_from_disk
from ranx import Run


# Skeleton of the generated page. It is completed with ``str.format`` in
# ``format_html``: ``{headers}`` receives the table header row and ``{rows}``
# the newline-joined data rows. The page links a stylesheet at ``./styles.css``
# (a relative URL, so resolved next to the written HTML file); the code shown
# in this module does not create that stylesheet.
HTML_TEMPLATE = """<html>
<head>
    <link rel="stylesheet" href="./styles.css">
</head>
<table>
    {headers}
    {rows}
</table>
</html>
"""


def get_top_1(item, run):
    """
    Return the first identifier listed for ``item`` in ``run``, as an int.

    Parameters
    ----------
    item: dict-like
        Must provide an ``'id'`` key, used to look up its results in ``run.run``.
    run: object
        Must expose a ``run`` mapping from item id to an iterable of
        identifiers (``format_html`` passes a ``ranx.Run``).

    Returns
    -------
    int
        The first identifier yielded when iterating over the item's results.
        NOTE(review): this is the top-ranked result only if the results are
        stored best-first - confirm against how the run was built.
    """
    # Iterating the per-item results yields identifiers; only the first is used.
    return int(next(iter(run.run[item['id']])))
def get_url_and_text(i, wiki, passages):
    """
    Look up the URL and the text to display for index ``i``.

    Parameters
    ----------
    i: int
        Index into ``passages`` when it is provided, otherwise into ``wiki``.
    wiki: indexable
        Entries must provide ``'url'`` and, when ``passages`` is None,
        ``'wikipedia_title'``.
    passages: indexable, optional
        Entries must provide ``'index'`` (position of the parent entry in
        ``wiki``) and ``'passage'`` (the text).

    Returns
    -------
    (url, text): tuple
        ``url`` always comes from the ``wiki`` entry. ``text`` is the passage
        when ``passages`` is provided, else the entry's ``'wikipedia_title'``.
    """
    # Without passages, ``i`` points directly at a wiki entry.
    if passages is None:
        entry = wiki[i]
        return entry['url'], entry['wikipedia_title']

    # With passages, ``i`` selects a passage that points back to its wiki entry.
    selected = passages[i]
    return wiki[selected['index']]['url'], selected['passage']
def format_html(
    dataset_path: str,
    output: str,
    n: int = None,
    width: int = 400,
    passages_path: str = None,
    wiki_path: str = None,
    search_run: str = None,
    other_search_run: str = None
):
    """
    Visualize dataset in HTML

    Parameters
    ----------
    dataset_path: str
        Path of the dataset, loaded with ``datasets.load_from_disk``. Each item
        must provide ``'id'``, ``'url'``, ``'input'`` and
        ``'output'['original_answer']``.
    output: str
        Path of the HTML file to write (UTF-8).
    n: int
        Number of examples to output after shuffling.
        Defaults to all without shuffling.
    width: int
        Width of the image in HTML [default: 400].
    passages_path: str
        Optional path of a passages dataset (``load_from_disk``). When given,
        identifiers found in the runs index this dataset.
    wiki_path: str
        Path of the wiki dataset (``load_from_disk``). Required as soon as
        ``search_run`` or ``other_search_run`` is given.
    search_run: str
        Optional path of a run file (``ranx.Run.from_file``); its first result
        for each item is shown in two extra columns.
    other_search_run: str
        Optional path of a second run file, shown in two further columns.

    Raises
    ------
    ValueError
        If a search run is given without ``wiki_path``.
    """
    # NOTE: the signature is deliberately left untouched, jsonargparse's CLI
    # is built from it (see the ``__main__`` guard of this module).

    # Escaping helper. ``xml.sax.saxutils`` is used instead of the stdlib
    # ``html`` module because this file is itself named ``html.py``: when it is
    # run directly as a script, ``import html`` could resolve to this file.
    from xml.sax.saxutils import escape

    def _esc(value):
        # Also escape double quotes so that values are safe inside attributes.
        return escape(str(value), {'"': '&quot;'})

    # Fail fast, before loading anything: the wiki is needed to resolve the
    # results of any search run (previously this crashed late with an
    # unbound-variable error).
    if wiki_path is None and (search_run is not None or other_search_run is not None):
        raise ValueError(
            "wiki_path is required when search_run or other_search_run is provided"
        )

    # complete template according to parameters
    header_cells = ["<th>Visual</th>", "<th>Question</th>", "<th>Answer</th>"]
    row_cells = [
        '<td><img src="{url}" width="{width}"></td>',
        "<td>{question}</td>",
        "<td>{answer}</td>",
    ]
    if search_run is not None:
        search_run = Run.from_file(search_run)
        header_cells += [
            "<th>Visual</th>",
            f"<th>Passage ({_esc(search_run.name)})</th>",
        ]
        row_cells += [
            '<td><img src="{passage_url}" width="{width}"></td>',
            "<td>{passage_text}</td>",
        ]
    if other_search_run is not None:
        other_search_run = Run.from_file(other_search_run)
        header_cells += [
            "<th>Other Visual</th>",
            f"<th>Passage ({_esc(other_search_run.name)})</th>",
        ]
        row_cells += [
            '<td><img src="{other_passage_url}" width="{width}"></td>',
            "<td>{other_passage_text}</td>",
        ]
    headers = "<tr>\n" + "\n".join(header_cells) + "\n</tr>"
    row_template = "<tr>\n" + "\n".join(row_cells) + "\n</tr>"

    # load data
    dataset = load_from_disk(dataset_path)
    if n is not None:
        # Clamp so that asking for more examples than available keeps them all
        # instead of selecting out-of-range indices.
        dataset = dataset.shuffle().select(range(min(n, len(dataset))))
    passages = load_from_disk(passages_path) if passages_path is not None else None
    wiki = load_from_disk(wiki_path) if wiki_path is not None else None

    # do the actual formatting
    rows = []
    for item in tqdm(dataset):
        row = dict(
            url=_esc(item['url']),
            width=width,
            question=_esc(item['input']),
            answer=_esc(item['output']['original_answer'])
        )
        if search_run is not None:
            url, text = get_url_and_text(get_top_1(item, search_run), wiki, passages)
            row['passage_url'], row['passage_text'] = _esc(url), _esc(text)
        if other_search_run is not None:
            url, text = get_url_and_text(get_top_1(item, other_search_run), wiki, passages)
            row['other_passage_url'], row['other_passage_text'] = _esc(url), _esc(text)
        rows.append(row_template.format(**row))

    html_str = HTML_TEMPLATE.format(headers=headers, rows='\n'.join(rows))
    # Explicit encoding: the dataset text may contain non-ASCII characters and
    # the platform default encoding is not always able to represent them.
    with open(output, 'wt', encoding='utf-8') as file:
        file.write(html_str)
if __name__ == '__main__':
    # Script entry point: hand ``format_html`` to jsonargparse's ``CLI``; the
    # options listed in the module docstring mirror its parameters.
    CLI(format_html)