merged stashed.
parent d43ed4658a
commit 086d858c3b
@@ -7,16 +7,14 @@ import matplotlib.pyplot as plt
DB = connect()

DB.sql("""
    with cte as (
        select
            count(1) as cnt
-            id
            ,url
        from stories
        group by url, outlet
    )
    -- ratio of stories appearing once to stories appearing more than once (per url/outlet group)
    select
        cast(sum(cnt) filter (where cnt = 1) as float)
            / sum(cnt) filter (where cnt > 1) as dups
    from cte
""")

DB.sql("""
    describe stories
""")

sns.histplot(x=hist['cnt'])
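
hist isn't defined in this hunk; a minimal sketch of how it could be built for the histogram above, assuming the same stories table and the DB connection opened earlier:

# hypothetical helper query: per-URL story counts, only used to feed the histogram
hist = DB.sql("""
    select
        count(1) as cnt,
        url
    from stories
    group by url
""").df()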
src/word.py (65 changed lines)
@@ -1,9 +1,10 @@
import click
from scipy.spatial import distance
from transformers import AutoTokenizer, RobertaModel
import numpy as np
from model import Model
-from data import Data, from_db, connect
+from data import Data, from_db, connect, data_dir
from tqdm import tqdm
import torch
from pathlib import Path


@click.group()
def cli():
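
The entry point isn't shown in these hunks; presumably the group is wired up in the usual click pattern, something like:

# assumed, not part of the diff
if __name__ == '__main__':
    cli()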
@@ -27,33 +28,61 @@ def max_sequence():
@cli.command()
def train():
    table = from_db(Data.Titles)

    n_classes = 10


@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
def embed(chunks, embedding_dest, token_dest):
    """given titles, generate tokens and word embeddings and save to disk"""

    # init models
    device = torch.device('cuda:0')
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = RobertaModel.from_pretrained("roberta-base")
    model.to(device)

    # load data
    db = connect()
    table = db.sql("""
        select
            title
        from stories
        order by id desc
    """).df()

    # normalize text: fold titles to plain ascii before tokenizing
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
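    # for reference, a tiny (hypothetical) example of what that fold does,
    # using the same pandas string methods as above:
    #   pd.Series(['naïve “résumé”']).str.normalize('NFKD')
    #     .str.encode('ascii', errors='ignore').str.decode('utf-8')[0]
    #   -> 'naive resume'   (accents reduced to base letters, curly quotes dropped)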
    # generate embeddings from list of titles
    def get_embeddings(titles):
        # create tokens, padding to max width
-        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
+        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
        tokens = tokens.to(device)
        with torch.no_grad():
            outputs = model(**tokens)
-        return outputs.last_hidden_state[:, 0, :]
        #outputs = outputs.to(torch.device('cpu'))
+        return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))

    # smoke test on a handful of titles before the full run
    titles = table['title'].apply(str).to_list()[:10]
    get_embeddings(titles)

    tokens = []
    embeddings = []
    chunks = np.array_split(table['title'].to_numpy(), chunks)
    chunk_iter = tqdm(chunks, 'embedding')
    for chunk in chunk_iter:
        data = chunk.tolist()
        token, embedding = get_embeddings(data)
        arr = embedding.detach().numpy()
        embeddings.append(arr)
        tokens.append(token['input_ids'].numpy())  # keep the ids as plain arrays so they concatenate below
    # exploratory leftovers, commented out (outputs, linear, act and pred_y are not defined in this scope)
    # outputs.last_hidden_state[0][200:]
    # outputs.values().shape
    # model

    # linear = torch.nn.Linear(model.config.hidden_size, n_classes)
    # act = torch.nn.Sigmoid()

    # model = Model()
    # classes = act(linear(pred_y.last_hidden_state[:, 0, :])).detach()

    embeddings = np.concatenate(embeddings)
    tokens = np.concatenate(tokens)
    np.save(embedding_dest, embeddings)
    np.save(token_dest, tokens)
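
    # Not part of the commit: a quick load-back check for the files written above,
    # assuming the default destinations under data_dir() and roberta-base's 768-dim
    # hidden states:
    #   emb = np.load(data_dir() / 'sequence_embeddings.npy')   # (n_titles, 92, 768)
    #   ids = np.load(data_dir() / 'sequence_tokens.npy')       # (n_titles, 92)
    #   print(emb.shape, ids.shape)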

@cli.command()
def distance():
    """TODO: measure distance between sequence embeddings"""
    distances = distance.cdist(classes, classes, 'euclidean')
    np.fill_diagonal(distances, np.inf)
    min_index = np.argmin(distances)
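
Not part of the commit: one way the TODO above could be filled in, assuming the arrays saved by embed() are what gets compared (classes does not exist in that scope). Note that the command name shadows the imported scipy distance module, so the import is re-aliased here; treat this as a sketch, not the author's implementation.

# hypothetical sketch of a nearest-pair lookup over the saved embeddings
from scipy.spatial import distance as scidist

emb = np.load(data_dir() / 'sequence_embeddings.npy')      # (n_titles, seq_len, hidden)
vecs = emb[:, 0, :]                                        # first-token (CLS-style) vector per title
distances = scidist.cdist(vecs, vecs, 'euclidean')
np.fill_diagonal(distances, np.inf)                        # ignore self-matches
i, j = np.unravel_index(np.argmin(distances), distances.shape)
print(f"closest pair of titles: {i} and {j}")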