merged stashed.
parent d43ed4658a
commit 086d858c3b
@@ -7,16 +7,14 @@ import matplotlib.pyplot as plt
DB = connect()

DB.sql("""
    with cte as (
        select
            count(1) as cnt
-            id
            ,url
        from stories
        group by url, outlet
    )
    -- ratio of stories appearing once to stories appearing more than once (per url/outlet group)
    select
        cast(sum(cnt) filter (where cnt = 1) as float)
            / sum(cnt) filter (where cnt > 1) as dups
    from cte
""")

DB.sql("""
    describe stories
""")

sns.histplot(x=hist['cnt'])
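
hist isn't defined in this hunk; a minimal sketch of how it could be built for the histogram above, assuming the same stories table and the DB connection opened earlier:

# hypothetical helper query: per-URL story counts, only used to feed the histogram
hist = DB.sql("""
    select
        count(1) as cnt,
        url
    from stories
    group by url
""").df()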
src/word.py (65 changed lines)
@@ -1,9 +1,10 @@
import click
from scipy.spatial import distance
from transformers import AutoTokenizer, RobertaModel
import numpy as np
from model import Model
-from data import Data, from_db, connect
+from data import Data, from_db, connect, data_dir
from tqdm import tqdm
import torch
from pathlib import Path


@click.group()
def cli():
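
The entry point isn't shown in these hunks; presumably the group is wired up in the usual click pattern, something like:

# assumed, not part of the diff
if __name__ == '__main__':
    cli()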
@@ -27,33 +28,61 @@ def max_sequence():
@cli.command()
def train():
    table = from_db(Data.Titles)

    n_classes = 10


@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
def embed(chunks, embedding_dest, token_dest):
    """given titles, generate tokens and word embeddings and save to disk"""

    # init models
    device = torch.device('cuda:0')
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = RobertaModel.from_pretrained("roberta-base")
    model.to(device)

    # load data
    db = connect()
    table = db.sql("""
        select
            title
        from stories
        order by id desc
    """).df()

    # normalize text: fold titles to plain ascii before tokenizing
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
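    # for reference, a tiny (hypothetical) example of what that fold does,
    # using the same pandas string methods as above:
    #   pd.Series(['naïve “résumé”']).str.normalize('NFKD')
    #     .str.encode('ascii', errors='ignore').str.decode('utf-8')[0]
    #   -> 'naive resume'   (accents reduced to base letters, curly quotes dropped)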
    # generate embeddings from list of titles
    def get_embeddings(titles):
        # create tokens, padding to max width
-        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
+        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
        tokens = tokens.to(device)
        with torch.no_grad():
            outputs = model(**tokens)
-        return outputs.last_hidden_state[:, 0, :]
        #outputs = outputs.to(torch.device('cpu'))
+        return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))

    # smoke test on a handful of titles before the full run
    titles = table['title'].apply(str).to_list()[:10]
    get_embeddings(titles)

    tokens = []
    embeddings = []
    chunks = np.array_split(table['title'].to_numpy(), chunks)
    chunk_iter = tqdm(chunks, 'embedding')
    for chunk in chunk_iter:
        data = chunk.tolist()
        token, embedding = get_embeddings(data)
        arr = embedding.detach().numpy()
        embeddings.append(arr)
        tokens.append(token['input_ids'].numpy())  # keep the ids as plain arrays so they concatenate below
    # exploratory leftovers, commented out (outputs, linear, act and pred_y are not defined in this scope)
    # outputs.last_hidden_state[0][200:]
    # outputs.values().shape
    # model

    # linear = torch.nn.Linear(model.config.hidden_size, n_classes)
    # act = torch.nn.Sigmoid()

    # model = Model()
    # classes = act(linear(pred_y.last_hidden_state[:, 0, :])).detach()

    embeddings = np.concatenate(embeddings)
    tokens = np.concatenate(tokens)
    np.save(embedding_dest, embeddings)
    np.save(token_dest, tokens)
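
    # Not part of the commit: a quick load-back check for the files written above,
    # assuming the default destinations under data_dir() and roberta-base's 768-dim
    # hidden states:
    #   emb = np.load(data_dir() / 'sequence_embeddings.npy')   # (n_titles, 92, 768)
    #   ids = np.load(data_dir() / 'sequence_tokens.npy')       # (n_titles, 92)
    #   print(emb.shape, ids.shape)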

@cli.command()
def distance():
    """TODO: measure distance between sequence embeddings"""
    distances = distance.cdist(classes, classes, 'euclidean')
    np.fill_diagonal(distances, np.inf)
    min_index = np.argmin(distances)
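
Not part of the commit: one way the TODO above could be filled in, assuming the arrays saved by embed() are what gets compared (classes does not exist in that scope). Note that the command name shadows the imported scipy distance module, so the import is re-aliased here; treat this as a sketch, not the author's implementation.

# hypothetical sketch of a nearest-pair lookup over the saved embeddings
from scipy.spatial import distance as scidist

emb = np.load(data_dir() / 'sequence_embeddings.npy')      # (n_titles, seq_len, hidden)
vecs = emb[:, 0, :]                                        # first-token (CLS-style) vector per title
distances = scidist.cdist(vecs, vecs, 'euclidean')
np.fill_diagonal(distances, np.inf)                        # ignore self-matches
i, j = np.unravel_index(np.argmin(distances), distances.shape)
print(f"closest pair of titles: {i} and {j}")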