add function to extract emotional labels from titles.
parent c38a5455a8
commit 3a6f97b290
@@ -23,4 +23,9 @@ if __name__ == "__main__":
     cli.add_command(mine.embeddings)
     cli.add_command(mine.cluster)
     cli.add_command(mine.plot)
+    import emotion
+    cli.add_command(emotion.extract)
+    cli.add_command(emotion.normalize)
+    cli.add_command(emotion.analyze)
+    cli.add_command(emotion.create_table)
     cli()
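A quick wiring check for the newly registered commands is click's built-in test runner. This is a minimal sketch: the entry module name ("main" here) is hypothetical; only the cli group and the command names come from the diff.

from click.testing import CliRunner
from main import cli  # hypothetical module name for the CLI entry point

runner = CliRunner()
result = runner.invoke(cli, ["emotion:create-table"])
print(result.exit_code, result.output)  # 0 on success, plus the "created" message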
@@ -0,0 +1,169 @@
+import click
+from tqdm import tqdm
+import torch
+import pandas as pd
+import numpy as np
+
+from transformers import BertTokenizer
+from model import BertForMultiLabelClassification
+from data import connect
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def data():
+    # load the stories that do not have emotion labels yet
+    DB = connect()
+    table = DB.sql("""
+        SELECT
+            id,
+            title
+        FROM stories
+        WHERE id NOT IN (
+            SELECT DISTINCT story_id
+            FROM story_emotions
+        )
+        ORDER BY id DESC
+    """).df()
+    DB.close()
+    return table
+
+
+@click.command("emotion:create-table")
+def create_table():
+    """create the table to hold the title id and labels."""
+    DB = connect()
+    table = "story_emotions"
+    # f-string, so the table name is interpolated into the DDL
+    DB.execute(f"""
+        CREATE OR REPLACE TABLE {table}
+        (
+            story_id BIGINT,
+            label TEXT,
+            score REAL
+        )
+    """)
+    DB.close()
+    print(f"\"{table}\" created")
+
+
+@click.command("emotion:extract")
+@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
+def extract(chunks):
+    """extract emotion class labels from titles and put them in the db"""
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
+    model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")
+    model.to(device)
+
+    table = data()
+    # --chunks is the number of batches the titles are split into, not the batch size
+    chunked = np.array_split(table.to_numpy(), chunks)
+    for part in tqdm(chunked):
+        ids = [x[0] for x in part]
+        docs = [x[1] for x in part]
+        tokens = tokenizer(docs, add_special_tokens=True, truncation=True, padding="max_length", max_length=92, return_attention_mask=True, return_tensors="pt")
+        tokens = tokens.to(device)
+        results = run(model, tokens, ids)
+        df = pd.DataFrame(results)
+        DB = connect()
+        DB.execute('INSERT INTO story_emotions SELECT * FROM df')
+        DB.close()
+
+
+def run(model, tokens, ids):
+    # keep every label whose sigmoid score clears the threshold
+    threshold = 0.1
+    with torch.no_grad():
+        outputs = model(**tokens)[0].to('cpu').detach().numpy()
+    scores = 1 / (1 + np.exp(-outputs))  # sigmoid
+    results = []
+    for i, item in enumerate(scores):
+        for idx, s in enumerate(item):
+            if s > threshold:
+                results.append({"story_id": ids[i], "label": model.config.id2label[idx], "score": s})
+    return results
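Note: run() treats the task as multi-label, squashing each logit independently through a sigmoid and keeping every label above the 0.1 cutoff, so a single title can emit several rows. A toy illustration with made-up logits and an illustrative label subset:

import numpy as np

id2label = {0: "joy", 1: "anger", 2: "annoyance", 3: "neutral"}  # illustrative subset
logits = np.array([[ 2.0, -3.0, -1.5,  0.5],
                   [-2.5,  1.0, -0.5, -4.0]])
scores = 1 / (1 + np.exp(-logits))  # sigmoid: one independent probability per label
threshold = 0.1
for story, row in enumerate(scores):
    kept = {id2label[j]: round(float(s), 3) for j, s in enumerate(row) if s > threshold}
    print(story, kept)  # first title keeps 3 of 4 labels, second keeps 2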
+
+
+@click.command("emotion:normalize")
+def normalize():
+    """normalize the emotion tables."""
+    DB = connect()
+    DB.sql("""
+        CREATE OR REPLACE TABLE emotions AS
+        SELECT
+            row_number() OVER () AS id
+            ,e.label
+            ,COUNT(1) AS stories
+        FROM story_emotions e
+        JOIN stories s
+            ON s.id = e.story_id
+        -- WHERE YEAR(s.published_at) < 2022
+        GROUP BY e.label
+        HAVING stories > 1000
+        ORDER BY stories DESC
+    """)
+    DB.sql("""
+        ALTER TABLE story_emotions
+        ADD COLUMN emotion_id int64
+    """)
+    DB.sql("""
+        UPDATE story_emotions
+        SET emotion_id = emotions.id
+        FROM emotions
+        WHERE emotions.label = story_emotions.label
+    """)
+    DB.sql("""
+        ALTER TABLE story_emotions
+        DROP COLUMN label
+    """)
+    DB.close()
+
+
+@click.command("emotion:analyze")
+def analyze():
+    """plot and group emotional labels"""
+    DB = connect()
+    # assign the query result so the plot below has data to draw
+    df = DB.sql("""
+        WITH grouped AS (
+            SELECT
+                YEAR(s.published_at) AS year
+                ,e.label
+                ,COUNT(1) AS stories
+            FROM story_emotions e
+            JOIN stories s
+                ON s.id = e.story_id
+            WHERE YEAR(s.published_at) < 2022
+                AND label = 'annoyance'
+            GROUP BY
+                YEAR(s.published_at)
+                ,e.label
+        ), total AS (
+            SELECT
+                e.label
+                ,COUNT(1) AS total
+            FROM grouped s
+            JOIN story_emotions e
+                ON e.label = s.label
+            GROUP BY
+                e.label
+        )
+        SELECT
+            g.year
+            ,g.label
+            ,100 * (g.stories / CAST(t.total AS float)) AS frac
+        FROM grouped g
+        JOIN total t
+            ON t.label = g.label
+        ORDER BY g.label, g.year
+    """).df()
+    DB.close()
+
+    sns.lineplot(x=df['year'], y=df['frac'], hue=df['label'])
+    plt.show()
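The INSERT INTO story_emotions SELECT * FROM df line works because DuckDB's replacement scan resolves df to the pandas DataFrame in the enclosing Python scope. That connect() returns a DuckDB connection is an assumption here, though the .sql(...).df() calls point that way. A self-contained sketch:

import duckdb
import pandas as pd

df = pd.DataFrame({"story_id": [1, 1], "label": ["joy", "neutral"], "score": [0.9, 0.2]})
con = duckdb.connect()  # in-memory database, just for the sketch
con.execute("CREATE TABLE story_emotions (story_id BIGINT, label TEXT, score REAL)")
con.execute("INSERT INTO story_emotions SELECT * FROM df")  # df found by replacement scan
print(con.sql("SELECT * FROM story_emotions").df())
con.close()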
src/model.py (+47)

@@ -14,3 +14,50 @@ class Model(nn.Module):
         outs = self.bert(**x)
         outs = self.act(self.linear(outs.last_hidden_state))
         return outs
+
+
+import torch.nn as nn
+from transformers import BertPreTrainedModel, BertModel
+
+
+class BertForMultiLabelClassification(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
+        self.loss_fct = nn.BCEWithLogitsLoss()
+
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        if labels is not None:
+            loss = self.loss_fct(logits, labels)
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), logits, (hidden_states), (attentions)
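forward() only computes a loss when labels is passed, and BCEWithLogitsLoss expects float multi-hot targets with one column per label rather than class indices. A minimal sketch; the 28-label width matches the GoEmotions taxonomy the pretrained checkpoint uses, and all values are made up:

import torch
import torch.nn as nn

logits = torch.randn(2, 28)             # batch of 2 titles, 28 emotion labels
labels = torch.zeros(2, 28)             # float multi-hot targets, not class indices
labels[0, 3] = 1.0                      # first title carries one label
labels[1, torch.tensor([5, 7])] = 1.0   # second carries two at once
loss = nn.BCEWithLogitsLoss()(logits, labels)
print(loss.item())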
src/word.py (35)

@@ -31,13 +31,14 @@ def train():

 @click.command(name="word:embed")
 @click.option('-c', '--chunks', type=int, default=5000, show_default=True)
-@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
-@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
-def embed(chunks, embedding_dest, token_dest):
+@click.option('--embedding_dir', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'embeddings'), show_default=True)
+@click.option('--token_dir', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'tokens'), show_default=True)
+@click.option('--device', help="device to process data on", type=str, default="cuda:0", show_default=True)
+def embed(chunks, embedding_dir, token_dir, device):
     """ given titles, generate tokens and word embeddings and saves to disk """

     # init models
-    device = torch.device('cuda:0')
+    device = torch.device(device)
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     model = RobertaModel.from_pretrained("roberta-base")
     model.to(device)

@@ -56,29 +57,21 @@ def embed(chunks, embedding_dest, token_dest):
     table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

     # generate embeddings from list of titles
-    def get_embeddings(titles):
+    chunks = np.array_split(table['title'].to_numpy(), chunks)
+    chunk_iter = tqdm(chunks, 'embedding')
+    for i, chunk in enumerate(chunk_iter):
         # create tokens, padding to max width
-        tokens = tokenizer(titles, add_special_tokens=True, truncation=True, padding="max_length", max_length=92, return_attention_mask=True, return_tensors="pt")
+        tokens = tokenizer(chunk.tolist(), add_special_tokens=True, truncation=True, padding="max_length", max_length=92, return_attention_mask=True, return_tensors="pt")
         tokens = tokens.to(device)
         with torch.no_grad():
             outputs = model(**tokens)
-        return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
-
-    tokens = []
-    embeddings = []
-    chunks = np.array_split(table['title'].to_numpy(), chunks)
-    chunk_iter = tqdm(chunks, 'embedding')
-    for chunk in chunk_iter:
-        data = chunk.tolist()
-        token, embedding = get_embeddings(data)
-        arr = embedding.detach().numpy()
-        embeddings.append(arr)
-        tokens.append(token)
+
+        # to disk
+        hidden = outputs.last_hidden_state.to(torch.device('cpu')).detach().numpy()
+        np.save(embedding_dir / f"embedding_{i}.npy", hidden)

-    embeddings = np.concatenate(embeddings)
-    tokens = np.concatenate(tokens)
-    np.save(embedding_dest, embeddings)
-    np.save(token_dest, tokens)
+        tokens = tokens.to(torch.device('cpu'))
+        np.save(token_dir / f"token_{i}.npy", tokens)


 @click.command(name="word:distance")
 def distance():
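Because embed() now writes one .npy file per chunk instead of a single array, downstream consumers have to reassemble them. A minimal sketch, with the directory path assumed for illustration (the actual default comes from data_dir()):

from pathlib import Path
import numpy as np

embedding_dir = Path("data/embeddings")  # hypothetical path
# sort on the numeric suffix so rows come back in the order they were written
files = sorted(embedding_dir.glob("embedding_*.npy"),
               key=lambda p: int(p.stem.split("_")[-1]))
embeddings = np.concatenate([np.load(f) for f in files])
print(embeddings.shape)  # (n_titles, 92, hidden_size)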