From 086d858c3b4386cbff2948339ab0cf505c697ed0 Mon Sep 17 00:00:00 2001
From: Matt
Date: Sat, 22 Apr 2023 16:32:37 -0700
Subject: [PATCH] merged stashed.

---
 src/broken_links.py | 14 ++++------
 src/word.py         | 67 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/src/broken_links.py b/src/broken_links.py
index 565b9d4..110e691 100644
--- a/src/broken_links.py
+++ b/src/broken_links.py
@@ -7,16 +7,14 @@ import matplotlib.pyplot as plt
 DB = connect()
 
 DB.sql("""
-with cte as (
 select
-    count(1) as cnt
+    id
+    ,url
 from stories
-group by url, outlet
-)
-select
-    cast(sum(cnt) filter (where cnt = 1) as float)
-    / sum(cnt) filter (where cnt > 1) as dups
-from cte
+""")
+
+DB.sql("""
+    describe stories
 """)
 
 sns.histplot(x=hist['cnt'])
diff --git a/src/word.py b/src/word.py
index 93ba245..b78e8ce 100644
--- a/src/word.py
+++ b/src/word.py
@@ -1,9 +1,10 @@
 import click
-from scipy.spatial import distance
 from transformers import AutoTokenizer, RobertaModel
 import numpy as np
-from model import Model
-from data import Data, from_db, connect
+from data import Data, from_db, connect, data_dir
+from tqdm import tqdm
+import torch
+from pathlib import Path
 
 @click.group()
 def cli():
@@ -27,33 +28,61 @@ def max_sequence():
 @cli.command()
 def train():
     table = from_db(Data.Titles)
-
-    n_classes = 10
+
+@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
+@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
+@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
+def embed(chunks, embedding_dest, token_dest):
+    """ given titles, generate tokens and word embeddings and saves to disk """
+
+    # init models
+    device = torch.device('cuda:0')
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     model = RobertaModel.from_pretrained("roberta-base")
+    model.to(device)
 
 
+    # load data
+    db = connect()
+    table = db.sql("""
+        select
+            title
+        from stories
+        order by id desc
+    """).df()
+
+    # normalize text
+    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
+
+    # generate embeddings from list of titles
    def get_embeddings(titles):
         # create tokens, padding to max width
-        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=70, return_attention_mask = True, return_tensors = "pt")
-        outputs = model(**tokens)
-        return outputs.last_hidden_state[:, 0, :]
+        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
+        tokens = tokens.to(device)
+        with torch.no_grad():
+            outputs = model(**tokens)
+        #outputs = outputs.to(torch.device('cpu'))
+        return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
 
-    titles = table['title'].apply(str).to_list()[:10]
-    get_embeddings(titles)
+    tokens = []
+    embeddings = []
+    chunks = np.array_split(table['title'].to_numpy(), chunks)
+    chunk_iter = tqdm(chunks, 'embedding')
+    for chunk in chunk_iter:
+        data = chunk.tolist()
+        token, embedding = get_embeddings(data)
+        arr = embedding.detach().numpy()
+        embeddings.append(arr)
+        tokens.append(token)
 
-    outputs.last_hidden_state[0][200:]
-    outputs.values().shape
-    model
-
-    # linear = torch.nn.Linear(model.config.hidden_size, n_classes)
-    # act = torch.nn.Sigmoid()
-
-    # model = Model()
-    classes = act(linear(pred_y.last_hidden_state[:, 0, :])).detach()
+    embeddings = np.concatenate(embeddings)
+    tokens = np.concatenate(tokens)
+    np.save(embedding_dest, embeddings)
+    np.save(token_dest, tokens)
 
 @cli.command()
 def distance():
+    """TODO: measure distance between sequence embeddings"""
     distances = distance.cdist(classes, classes, 'euclidean')
     np.fill_diagonal(distances, np.inf)
     min_index = (np.argmin(distances))
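
Two issues in the new embed() code are worth flagging. The @click.option stack has no @cli.command() above it, so embed is never registered with the cli group. And get_embeddings() now returns the tokenizer's BatchEncoding, which is dict-like rather than array-like, so np.concatenate(tokens) at the end of the loop will raise. (broken_links.py has a related loose end: sns.histplot(x=hist['cnt']) survives even though the query that produced cnt was replaced.) A minimal sketch of the collection loop, assuming the intent is to persist input_ids alongside the hidden states:

# sketch only: same loop as embed(), but appending plain ndarrays
# so the final np.concatenate calls succeed
tokens = []
embeddings = []
for chunk in tqdm(np.array_split(table['title'].to_numpy(), chunks), 'embedding'):
    token, embedding = get_embeddings(chunk.tolist())
    embeddings.append(embedding.numpy())        # (len(chunk), 92, 768)
    tokens.append(token['input_ids'].numpy())   # (len(chunk), 92)
embeddings = np.concatenate(embeddings)
tokens = np.concatenate(tokens)

Saving the full last_hidden_state also costs 92 * 768 floats per title; if downstream use only needs one vector per title, persisting the CLS slice (last_hidden_state[:, 0, :]) as the pre-patch code did cuts the file size by a factor of 92.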
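For the distance() TODO, one possible completion, assuming the default paths written by embed(): the command name shadows scipy.spatial.distance (whose module import this patch removes), and classes no longer exists, so the sketch below imports cdist directly and compares CLS vectors from the saved array.

from scipy.spatial.distance import cdist

@cli.command()
def distance():
    """ measure distance between sequence embeddings """
    # load the embeddings written by embed() at its default path
    embeddings = np.load(data_dir() / 'sequence_embeddings.npy')
    cls = embeddings[:, 0, :]               # one 768-dim CLS vector per title
    distances = cdist(cls, cls, 'euclidean')
    np.fill_diagonal(distances, np.inf)     # so argmin skips self-distance
    i, j = np.unravel_index(np.argmin(distances), distances.shape)
    print(f"closest pair: rows {i} and {j} (distance {distances[i, j]:.4f})")

The n x n matrix is fine for tens of thousands of titles; beyond that, a chunked scan or an approximate nearest-neighbor index avoids the quadratic memory.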