merged stashed.
parent d43ed4658a
commit 086d858c3b
@@ -7,16 +7,14 @@ import matplotlib.pyplot as plt
 DB = connect()

 DB.sql("""
-with cte as (
 select
-count(1) as cnt
+id
+,url
 from stories
-group by url, outlet
-)
-select
-cast(sum(cnt) filter (where cnt = 1) as float)
-/ sum(cnt) filter (where cnt > 1) as dups
-from cte
+""")
+
+DB.sql("""
+describe stories
 """)

 sns.histplot(x=hist['cnt'])
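Note: the histplot context line above still reads hist['cnt'], but the group-by query that produced cnt is removed by this commit. A minimal sketch of rebuilding that frame before plotting, assuming the connection returned by connect() exposes DuckDB's .df() accessor (as used in src/word.py below); the hist name and this exact query shape are illustrative, not part of the commit:

    # hypothetical: re-materialize per-url/outlet counts so the histogram still has data
    hist = DB.sql("""
    select count(1) as cnt
    from stories
    group by url, outlet
    """).df()

    sns.histplot(x=hist['cnt'])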
src/word.py (67 lines changed)
@@ -1,9 +1,10 @@
 import click
-from scipy.spatial import distance
 from transformers import AutoTokenizer, RobertaModel
 import numpy as np
-from model import Model
-from data import Data, from_db, connect
+from data import Data, from_db, connect, data_dir
+from tqdm import tqdm
+import torch
+from pathlib import Path

 @click.group()
 def cli():
@@ -27,33 +28,61 @@ def max_sequence():
 @cli.command()
 def train():
     table = from_db(Data.Titles)


     n_classes = 10

+@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
+@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
+@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
+def embed(chunks, embedding_dest, token_dest):
+    """ given titles, generate tokens and word embeddings and saves to disk """
+    # init models
+    device = torch.device('cuda:0')
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     model = RobertaModel.from_pretrained("roberta-base")
+    model.to(device)
+    # load data
+    db = connect()
+    table = db.sql("""
+        select
+            title
+        from stories
+        order by id desc
+    """).df()
+    # normalize text
+    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
+    # generate embeddings from list of titles
     def get_embeddings(titles):
         # create tokens, padding to max width
-        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
-        outputs = model(**tokens)
-        return outputs.last_hidden_state[:, 0, :]
+        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
+        tokens = tokens.to(device)
+        with torch.no_grad():
+            outputs = model(**tokens)
+        #outputs = outputs.to(torch.device('cpu'))
+        return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))

-    titles = table['title'].apply(str).to_list()[:10]
-    get_embeddings(titles)
+    tokens = []
+    embeddings = []
+    chunks = np.array_split(table['title'].to_numpy(), chunks)
+    chunk_iter = tqdm(chunks, 'embedding')
+    for chunk in chunk_iter:
+        data = chunk.tolist()
+        token, embedding = get_embeddings(data)
+        arr = embedding.detach().numpy()
+        embeddings.append(arr)
+        tokens.append(token)

-    outputs.last_hidden_state[0][200:]
-    outputs.values().shape
-    model
-    # linear = torch.nn.Linear(model.config.hidden_size, n_classes)
-    # act = torch.nn.Sigmoid()
+    embeddings = np.concatenate(embeddings)
+    tokens = np.concatenate(tokens)
+    np.save(embedding_dest, embeddings)
+    np.save(token_dest, tokens)

-    # model = Model()
-    classes = act(linear(pred_y.last_hidden_state[:, 0, :])).detach()

 @cli.command()
 def distance():
+    """TODO: measure distance between sequence embeddings"""
     distances = distance.cdist(classes, classes, 'euclidean')
     np.fill_diagonal(distances, np.inf)
     min_index = (np.argmin(distances))
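Note: distance() still references classes, which this commit deletes from train(), and the from scipy.spatial import distance import is dropped in the same hunk, so the bare name distance now resolves to the command function itself. A minimal sketch of wiring the TODO to the files written by embed(), assuming the default sequence_embeddings.npy path and re-adding the scipy import; the emb and cls names are illustrative:

    # hypothetical follow-up for distance(): nearest pair of titles by embedding
    import numpy as np
    from scipy.spatial.distance import cdist
    from data import data_dir

    emb = np.load(data_dir() / 'sequence_embeddings.npy')  # shape (n_titles, max_length, hidden)
    cls = emb[:, 0, :]                                      # [CLS] vector per title, as in the old get_embeddings
    distances = cdist(cls, cls, 'euclidean')
    np.fill_diagonal(distances, np.inf)                     # ignore self-distances
    i, j = np.unravel_index(np.argmin(distances), distances.shape)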