diff --git a/src/broken_links.py b/src/broken_links.py
index 565b9d4..110e691 100644
--- a/src/broken_links.py
+++ b/src/broken_links.py
@@ -7,16 +7,14 @@ import matplotlib.pyplot as plt

 DB = connect()
 DB.sql("""
-with cte as (
 select
-  count(1) as cnt
+  id
+  ,url
 from stories
-group by url, outlet
-)
-select
-  cast(sum(cnt) filter (where cnt = 1) as float)
-  / sum(cnt) filter (where cnt > 1) as dups
-from cte
+""")
+
+DB.sql("""
+  describe stories
 """)

 sns.histplot(x=hist['cnt'])
diff --git a/src/word.py b/src/word.py
index 93ba245..b78e8ce 100644
--- a/src/word.py
+++ b/src/word.py
@@ -1,9 +1,10 @@
 import click
-from scipy.spatial import distance
 from transformers import AutoTokenizer, RobertaModel
 import numpy as np
-from model import Model
-from data import Data, from_db, connect
+from data import Data, from_db, connect, data_dir
+from tqdm import tqdm
+import torch
+from pathlib import Path

 @click.group()
 def cli():
@@ -27,33 +28,61 @@ def max_sequence():
 @cli.command()
 def train():
     table = from_db(Data.Titles)
-
-    n_classes = 10
+
+@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
+@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
+@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
+def embed(chunks, embedding_dest, token_dest):
+    """ given titles, generate tokens and word embeddings and saves to disk """
+
+    # init models
+    device = torch.device('cuda:0')
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     model = RobertaModel.from_pretrained("roberta-base")
+    model.to(device)

+    # load data
+    db = connect()
+    table = db.sql("""
+        select
+            title
+        from stories
+        order by id desc
+    """).df()
+
+    # normalize text
+    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
+
+    # generate embeddings from list of titles
     def get_embeddings(titles):
         # create tokens, padding to max width
-        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
-        outputs = model(**tokens)
-        return outputs.last_hidden_state[:, 0, :]
+        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
+        tokens = tokens.to(device)
+        with torch.no_grad():
+            outputs = model(**tokens)
+        #outputs = outputs.to(torch.device('cpu'))
+        return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))

-    titles = table['title'].apply(str).to_list()[:10]
-    get_embeddings(titles)
+    tokens = []
+    embeddings = []
+    chunks = np.array_split(table['title'].to_numpy(), chunks)
+    chunk_iter = tqdm(chunks, 'embedding')
+    for chunk in chunk_iter:
+        data = chunk.tolist()
+        token, embedding = get_embeddings(data)
+        arr = embedding.detach().numpy()
+        embeddings.append(arr)
+        tokens.append(token)

-    outputs.last_hidden_state[0][200:]
-    outputs.values().shape
-    model
-
-    # linear = torch.nn.Linear(model.config.hidden_size, n_classes)
-    # act = torch.nn.Sigmoid()
-
-    # model = Model()
-    classes = act(linear(pred_y.last_hidden_state[:, 0, :])).detach()
+    embeddings = np.concatenate(embeddings)
+    tokens = np.concatenate(tokens)
+    np.save(embedding_dest, embeddings)
+    np.save(token_dest, tokens)


 @cli.command()
 def distance():
+    """TODO: measure distance between sequence embeddings"""
     distances = distance.cdist(classes, classes, 'euclidean')
     np.fill_diagonal(distances, np.inf)
     min_index = (np.argmin(distances))