# -*- coding: utf-8 -*-
"""Usage: umap.py <dataset> <key> <output> [<config>]"""
from docopt import docopt
import json
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from bokeh.plotting import figure, output_file, save
from bokeh.models import HoverTool, ColumnDataSource, LinearColorMapper, ColorBar
from datasets import load_from_disk
import ranx
import urllib
from ..data.wiki import thumbnail_to_file_name, file_name_to_thumbnail
[docs]def reduce(embeddings, metric='cosine'):
reducer = umap.UMAP(metric=metric)
reducer.fit(embeddings)
reduced_embeddings = reducer.transform(embeddings)
return reduced_embeddings
[docs]def fplot(reduced_embeddings, figsize=(20,20), alpha=0.5, s=5):
fig, ax = plt.subplots(figsize=figsize)
ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], alpha=alpha, s=s)
return fig
[docs]def iplot(reduced_embeddings, dataset, urls, input_key='input', thumb_width=128, title='UMAP projection',
plot_width=600, plot_height=600, tools=('pan, wheel_zoom, reset'),
line_alpha=0.6, fill_alpha=0.6, size=4, metric=None, palette='Viridis256'):
thumbnails = [file_name_to_thumbnail(thumbnail_to_file_name(urllib.parse.unquote(url)), thumb_width) for url in urls]
df = pd.DataFrame(reduced_embeddings, columns=('x', 'y'))
if input_key is None:
df['text'] = ['']*len(dataset)
else:
df['text'] = dataset[input_key]
df['image'] = thumbnails
if metric is not None:
df[metric] = dataset[metric]
cmap = LinearColorMapper(palette=palette, low = df[metric].min(), high = df[metric].max())
fill_color = {'field': metric, 'transform': cmap}
else:
fill_color = 'gray'
cmap = None
datasource = ColumnDataSource(df)
fig = figure(title=title, plot_width=plot_width, plot_height=plot_height, tools=tools)
fig.add_tools(HoverTool(tooltips="""
<div>
<div>
<img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
</div>
<div>
<span style='font-size: 10px'>@text</span>
</div>
</div>
"""))
fig.circle(
'x',
'y',
source=datasource,
line_alpha=line_alpha,
fill_alpha=fill_alpha,
size=size,
fill_color=fill_color
)
if cmap is not None:
cb = ColorBar(color_mapper = cmap)
fig.add_layout(cb, 'right')
return fig
[docs]def get_ranx_run(qrels_path, run_path, metric='mrr'):
qrels = ranx.Qrels.from_file(qrels_path)
run = ranx.Run.from_file(run_path)
ranx.evaluate(qrels, run, metrics=metric)
return run, metric
[docs]def main(dataset, key, output_path, image_kb=None, shard=None, url_key='url',
reduce_kwargs={}, fplot_kwargs={}, iplot_kwargs={}, ranx_kwargs=None, face=False):
output_path.mkdir(exist_ok=True)
if shard is not None:
dataset = dataset.shuffle(seed=0).shard(shard, 0)
if face:
dataset = dataset.filter(lambda x: x is not None, input_columns=key)
embeddings = np.array([x[0] for x in dataset[key]])
else:
embeddings = np.array(dataset[key])
# add features from the image KB
if image_kb is not None:
image_kb = load_from_disk(image_kb)
print(image_kb)
indices = dataset['index']
print(len(indices))
urls = image_kb.select(indices)[url_key]
print(len(urls))
else:
urls = dataset[url_key]
if ranx_kwargs is not None:
ranx_run, metric = get_ranx_run(**ranx_kwargs)
dataset=dataset.map(lambda item: {metric: ranx_run.scores[metric][item["id"]]})
else:
ranx_run, metric = None, None
reduced_embeddings = reduce(embeddings, **reduce_kwargs)
ffig = fplot(reduced_embeddings, **fplot_kwargs)
ffig.savefig(output_path/f"umap_{key}.png")
ifig = iplot(reduced_embeddings, dataset, urls=urls, metric=metric, **iplot_kwargs)
output_file(output_path/f"umap_{key}.html")
save(ifig)
if __name__ == '__main__':
args = docopt(__doc__)
dataset = load_from_disk(args['<dataset>'])
config_path = args['<config>']
if config_path is not None:
with open(config_path, 'rt') as file:
config = json.load(file)
else:
config = {}
output_path = Path(args['<output>'])
main(dataset, args['<key>'], output_path, **config)