merged stashed.

Matt 2023-04-22 16:32:37 -07:00
parent d43ed4658a
commit 086d858c3b
2 changed files with 54 additions and 27 deletions

View File

@@ -7,16 +7,14 @@ import matplotlib.pyplot as plt
 DB = connect()
 DB.sql("""
-with cte as (
 select
-  count(1) as cnt
-  ,url
+  id
 from stories
-group by url, outlet
-)
-select
-  cast(sum(cnt) filter (where cnt = 1) as float)
-  / sum(cnt) filter (where cnt > 1) as dups
-from cte
+""")
+
+DB.sql("""
+describe stories
 """)
 sns.histplot(x=hist['cnt'])
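For context on the plotting call that survives this hunk: sns.histplot(x=hist['cnt']) expects a frame with a per-url count, which the removed CTE used to build. A minimal sketch of recreating such a frame with DuckDB, assuming the same stories table and the connect() helper from this repo's data module; the hist name and grouping by url alone are illustrative, not part of the commit:

import seaborn as sns
from data import connect

DB = connect()
# one row per url with the number of stories that share it
hist = DB.sql("""
    select
        url,
        count(1) as cnt
    from stories
    group by url
""").df()
# distribution of how many times each url repeats
sns.histplot(x=hist['cnt'])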

View File

@@ -1,9 +1,10 @@
 import click
-from scipy.spatial import distance
 from transformers import AutoTokenizer, RobertaModel
 import numpy as np
-from model import Model
-from data import Data, from_db, connect
+from data import Data, from_db, connect, data_dir
+from tqdm import tqdm
+import torch
+from pathlib import Path
 
 @click.group()
 def cli():
@@ -27,33 +28,61 @@ def max_sequence():
 @cli.command()
 def train():
     table = from_db(Data.Titles)
     n_classes = 10
+
+@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
+@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
+@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
+def embed(chunks, embedding_dest, token_dest):
+    """ given titles, generate tokens and word embeddings and saves to disk """
+    # init models
+    device = torch.device('cuda:0')
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     model = RobertaModel.from_pretrained("roberta-base")
+    model.to(device)
+
+    # load data
+    db = connect()
+    table = db.sql("""
+        select
+            title
+        from stories
+        order by id desc
+    """).df()
+
+    # normalize text
+    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
+
+    # generate embeddings from list of titles
     def get_embeddings(titles):
         # create tokens, padding to max width
-        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
-        outputs = model(**tokens)
-        return outputs.last_hidden_state[:, 0, :]
+        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
+        tokens = tokens.to(device)
+        with torch.no_grad():
+            outputs = model(**tokens)
+        #outputs = outputs.to(torch.device('cpu'))
+        return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
 
-    titles = table['title'].apply(str).to_list()[:10]
-    get_embeddings(titles)
-    outputs.last_hidden_state[0][200:]
-    outputs.values().shape
-    model
-    # linear = torch.nn.Linear(model.config.hidden_size, n_classes)
-    # act = torch.nn.Sigmoid()
-    # model = Model()
-    classes = act(linear(pred_y.last_hidden_state[:, 0, :])).detach()
+    tokens = []
+    embeddings = []
+    chunks = np.array_split(table['title'].to_numpy(), chunks)
+    chunk_iter = tqdm(chunks, 'embedding')
+    for chunk in chunk_iter:
+        data = chunk.tolist()
+        token, embedding = get_embeddings(data)
+        arr = embedding.detach().numpy()
+        embeddings.append(arr)
+        tokens.append(token)
+    embeddings = np.concatenate(embeddings)
+    tokens = np.concatenate(tokens)
+    np.save(embedding_dest, embeddings)
+    np.save(token_dest, tokens)
 
 @cli.command()
 def distance():
+    """TODO: measure distance between sequence embeddings"""
     distances = distance.cdist(classes, classes, 'euclidean')
     np.fill_diagonal(distances, np.inf)
     min_index = (np.argmin(distances))
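The distance() command is still a TODO and refers to a classes array that the stub never builds. A minimal sketch of one way it could work under the defaults added in this commit (loading data_dir() / 'sequence_embeddings.npy' as written by embed(), and aliasing the scipy import so it no longer collides with the command name); the slice to the first token position and np.unravel_index are assumptions, not the author's code:

import numpy as np
from scipy.spatial import distance as spatial_distance  # aliased to avoid shadowing by def distance()
from data import data_dir

# embed() saves the full last_hidden_state (titles x sequence x hidden);
# keep only the first (<s>) position per title, as the old get_embeddings() returned
embeddings = np.load(data_dir() / 'sequence_embeddings.npy')
cls = embeddings[:, 0, :]

distances = spatial_distance.cdist(cls, cls, 'euclidean')
np.fill_diagonal(distances, np.inf)  # ignore each title's distance to itself
row, col = np.unravel_index(np.argmin(distances), distances.shape)
print(row, col)  # indices of the two closest titles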