merged stashed.

Matt 2023-04-22 16:32:37 -07:00
parent d43ed4658a
commit 086d858c3b
2 changed files with 54 additions and 27 deletions

View File

@@ -7,16 +7,14 @@ import matplotlib.pyplot as plt
 DB = connect()
 DB.sql("""
-with cte as (
 select
-  count(1) as cnt
-  ,url
+  id
 from stories
-group by url, outlet
-)
-select
-  cast(sum(cnt) filter (where cnt = 1) as float)
-  / sum(cnt) filter (where cnt > 1) as dups
-from cte
+""")
+
+DB.sql("""
+describe stories
 """)
 sns.histplot(x=hist['cnt'])
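For context on the plotting call that survives this hunk: sns.histplot(x=hist['cnt']) expects a frame with a per-url count, which the removed CTE used to build. A minimal sketch of recreating such a frame with DuckDB, assuming the same stories table and the connect() helper from this repo's data module; the hist name and grouping by url alone are illustrative, not part of the commit:

import seaborn as sns
from data import connect

DB = connect()
# one row per url with the number of stories that share it
hist = DB.sql("""
    select
        url,
        count(1) as cnt
    from stories
    group by url
""").df()
# distribution of how many times each url repeats
sns.histplot(x=hist['cnt'])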

View File

@@ -1,9 +1,10 @@
 import click
-from scipy.spatial import distance
 from transformers import AutoTokenizer, RobertaModel
 import numpy as np
-from model import Model
-from data import Data, from_db, connect
+from data import Data, from_db, connect, data_dir
+from tqdm import tqdm
+import torch
+from pathlib import Path
 
 @click.group()
 def cli():
@@ -27,33 +28,61 @@ def max_sequence():
 @cli.command()
 def train():
     table = from_db(Data.Titles)
     n_classes = 10
+
+@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
+@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
+@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
+def embed(chunks, embedding_dest, token_dest):
+    """ given titles, generate tokens and word embeddings and saves to disk """
+    # init models
+    device = torch.device('cuda:0')
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     model = RobertaModel.from_pretrained("roberta-base")
+    model.to(device)
+
+    # load data
+    db = connect()
+    table = db.sql("""
+        select
+            title
+        from stories
+        order by id desc
+    """).df()
+
+    # normalize text
+    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
+
+    # generate embeddings from list of titles
     def get_embeddings(titles):
         # create tokens, padding to max width
-        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
-        outputs = model(**tokens)
-        return outputs.last_hidden_state[:, 0, :]
+        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
+        tokens = tokens.to(device)
+        with torch.no_grad():
+            outputs = model(**tokens)
+        #outputs = outputs.to(torch.device('cpu'))
+        return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
 
-    titles = table['title'].apply(str).to_list()[:10]
-    get_embeddings(titles)
-    outputs.last_hidden_state[0][200:]
-    outputs.values().shape
-    model
-    # linear = torch.nn.Linear(model.config.hidden_size, n_classes)
-    # act = torch.nn.Sigmoid()
-    # model = Model()
-    classes = act(linear(pred_y.last_hidden_state[:, 0, :])).detach()
+    tokens = []
+    embeddings = []
+    chunks = np.array_split(table['title'].to_numpy(), chunks)
+    chunk_iter = tqdm(chunks, 'embedding')
+    for chunk in chunk_iter:
+        data = chunk.tolist()
+        token, embedding = get_embeddings(data)
+        arr = embedding.detach().numpy()
+        embeddings.append(arr)
+        tokens.append(token)
+    embeddings = np.concatenate(embeddings)
+    tokens = np.concatenate(tokens)
+    np.save(embedding_dest, embeddings)
+    np.save(token_dest, tokens)
 
 @cli.command()
 def distance():
+    """TODO: measure distance between sequence embeddings"""
     distances = distance.cdist(classes, classes, 'euclidean')
     np.fill_diagonal(distances, np.inf)
     min_index = (np.argmin(distances))
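The distance() command is still a TODO and refers to a classes array that the stub never builds. A minimal sketch of one way it could work under the defaults added in this commit (loading data_dir() / 'sequence_embeddings.npy' as written by embed(), and aliasing the scipy import so it no longer collides with the command name); the slice to the first token position and np.unravel_index are assumptions, not the author's code:

import numpy as np
from scipy.spatial import distance as spatial_distance  # aliased to avoid shadowing by def distance()
from data import data_dir

# embed() saves the full last_hidden_state (titles x sequence x hidden);
# keep only the first (<s>) position per title, as the old get_embeddings() returned
embeddings = np.load(data_dir() / 'sequence_embeddings.npy')
cls = embeddings[:, 0, :]

distances = spatial_distance.cdist(cls, cls, 'euclidean')
np.fill_diagonal(distances, np.inf)  # ignore each title's distance to itself
row, col = np.unravel_index(np.argmin(distances), distances.shape)
print(row, col)  # indices of the two closest titles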