merged stashed.

Matt 2023-04-22 16:32:37 -07:00
parent d43ed4658a
commit 086d858c3b
2 changed files with 54 additions and 27 deletions

View File

@@ -7,16 +7,14 @@ import matplotlib.pyplot as plt
DB = connect()
DB.sql("""
with cte as (
select
count(1) as cnt
id
,url
from stories
group by url, outlet
)
select
cast(sum(cnt) filter (where cnt = 1) as float)
/ sum(cnt) filter (where cnt > 1) as dups
from cte
""")
DB.sql("""
describe stories
""")
sns.histplot(x=hist['cnt'])
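
For reference, a self-contained version of the duplicate-ratio analysis above might look like the following sketch. It assumes a DuckDB database exposing the stories table with url and outlet columns, as in the hunk; the file path, the hist/counts variable names, and the dup_ratio alias are illustrative, not part of the commit.

import duckdb
import seaborn as sns

# Hypothetical path; the commit uses the project's connect() helper instead.
db = duckdb.connect("stories.db")

# Group stories by (url, outlet) and compare how much volume appears
# exactly once versus more than once.
counts = db.sql("""
    with cte as (
        select count(1) as cnt
        from stories
        group by url, outlet
    )
    select
        cast(sum(cnt) filter (where cnt = 1) as float)
        / sum(cnt) filter (where cnt > 1) as dup_ratio
    from cte
""").df()

# Distribution of per-(url, outlet) story counts, mirroring the histplot call above.
hist = db.sql("select count(1) as cnt from stories group by url, outlet").df()
sns.histplot(x=hist['cnt'])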

View File

@@ -1,9 +1,10 @@
import click
from scipy.spatial import distance
from transformers import AutoTokenizer, RobertaModel
import numpy as np
from model import Model
from data import Data, from_db, connect
from data import Data, from_db, connect, data_dir
from tqdm import tqdm
import torch
from pathlib import Path
@click.group()
def cli():
@@ -27,33 +28,61 @@ def max_sequence():
@cli.command()
def train():
table = from_db(Data.Titles)
n_classes = 10
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
def embed(chunks, embedding_dest, token_dest):
""" given titles, generate tokens and word embeddings and saves to disk """
# init models
device = torch.device('cuda:0')
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")
model.to(device)
# load data
db = connect()
table = db.sql("""
select
title
from stories
order by id desc
""").df()
# normalize text
table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
# generate embeddings from list of titles
def get_embeddings(titles):
# create tokens, padding to max width
tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
outputs = model(**tokens)
return outputs.last_hidden_state[:, 0, :]
tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
tokens = tokens.to(device)
with torch.no_grad():
outputs = model(**tokens)
#outputs = outputs.to(torch.device('cpu'))
return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
titles = table['title'].apply(str).to_list()[:10]
get_embeddings(titles)
tokens = []
embeddings = []
chunks = np.array_split(table['title'].to_numpy(), chunks)
chunk_iter = tqdm(chunks, 'embedding')
for chunk in chunk_iter:
data = chunk.tolist()
token, embedding = get_embeddings(data)
arr = embedding.detach().numpy()
embeddings.append(arr)
tokens.append(token)
outputs.last_hidden_state[0][200:]
outputs.values().shape
model
# linear = torch.nn.Linear(model.config.hidden_size, n_classes)
# act = torch.nn.Sigmoid()
# model = Model()
classes = act(linear(pred_y.last_hidden_state[:, 0, :])).detach()
embeddings = np.concatenate(embeddings)
tokens = np.concatenate(tokens)
np.save(embedding_dest, embeddings)
np.save(token_dest, tokens)
@cli.command()
def distance():
"""TODO: measure distance between sequence embeddings"""
distances = distance.cdist(classes, classes, 'euclidean')
np.fill_diagonal(distances, np.inf)
min_index = (np.argmin(distances))
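
The distance command above is still a TODO: as written, the function name distance shadows the scipy.spatial.distance import, and classes is not defined in its scope. A minimal sketch of the intended measurement, assuming it should operate on the embeddings saved by the embed command (the path and helper name below are illustrative), could look like this:

import numpy as np
from scipy.spatial import distance as scipy_distance  # renamed to avoid shadowing

def nearest_titles(embedding_path="sequence_embeddings.npy"):
    # Load the hidden states saved by the embed command.
    embeddings = np.load(embedding_path)
    # If the full per-token states were saved (3D array), pool to one vector
    # per title, e.g. the CLS position.
    if embeddings.ndim == 3:
        embeddings = embeddings[:, 0, :]
    # Pairwise euclidean distances between all title embeddings.
    distances = scipy_distance.cdist(embeddings, embeddings, 'euclidean')
    # Mask self-distances before looking for the closest pair.
    np.fill_diagonal(distances, np.inf)
    i, j = np.unravel_index(np.argmin(distances), distances.shape)
    return i, j, distances[i, j]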