add function to extract emotional labels from titles.

Matt 2023-04-27 16:40:33 -07:00
parent c38a5455a8
commit 3a6f97b290
4 changed files with 235 additions and 21 deletions


@@ -23,4 +23,9 @@ if __name__ == "__main__":
     cli.add_command(mine.embeddings)
     cli.add_command(mine.cluster)
     cli.add_command(mine.plot)
+    import emotion
+    cli.add_command(emotion.extract)
+    cli.add_command(emotion.normalize)
+    cli.add_command(emotion.analyze)
+    cli.add_command(emotion.create_table)
     cli()
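The five lines added here register a small pipeline of click commands. A rough sketch of driving them in order from Python using click's test runner follows; the surrounding `cli` group and module layout are not shown in this diff, so the import path and option values are assumptions.

# sketch only: exercise the new emotion commands end to end via click's CliRunner
from click.testing import CliRunner

import emotion  # the new src/emotion.py below

runner = CliRunner()
runner.invoke(emotion.create_table, [])              # emotion:create-table
runner.invoke(emotion.extract, ["--chunks", "500"])  # emotion:extract
runner.invoke(emotion.normalize, [])                 # emotion:normalize
runner.invoke(emotion.analyze, [])                   # emotion:analyze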

src/emotion.py (new file, 169 lines added)

@@ -0,0 +1,169 @@
import click
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from model import BertForMultiLabelClassification
from data import connect
import seaborn as sns
import matplotlib.pyplot as plt

def data():
    # load stories that do not yet have emotion labels
    DB = connect()
    table = DB.sql("""
        SELECT
            id,
            title
        FROM stories
        WHERE id NOT IN (
            SELECT DISTINCT story_id
            FROM story_emotions
        )
        ORDER BY id DESC
    """).df()
    DB.close()
    return table
@click.command("emotion:create-table")
def create_table():
"""create the table to hold the title id and labels."""
DB = connect()
table = "story_emotions"
DB.execute("""
CREATE OR REPLACE TABLE {table}
(
story_id BIGINT,
label TEXT,
score REAL
)
""")
DB.close()
print(f"\"{table}\" created")
@click.command("emotion:extract")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
def extract(chunks):
"""extract emotion class labels from titles and put them in the db"""
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")
model.to(device)
table = data()
chunked = np.array_split(table.to_numpy(), chunks)
for part in tqdm(chunked):
ids = [x[0] for x in part]
docs = [x[1] for x in part]
tokens = tokenizer(docs, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
tokens = tokens.to(device)
results = run(model, tokens, ids)
df = pd.DataFrame(results)
DB = connect()
DB.execute('INSERT INTO story_emotions SELECT * FROM df')
DB.close()

def run(model, tokens, ids):
    threshold = 0.1
    with torch.no_grad():
        outputs = model(**tokens)[0].to('cpu').detach().numpy()
    scores = 1 / (1 + np.exp(-outputs))  # sigmoid: independent probability per label
    results = []
    for i, item in enumerate(scores):
        for idx, s in enumerate(item):
            # keep every label whose score clears the threshold (multi-label)
            if s > threshold:
                results.append({"story_id": ids[i], "label": model.config.id2label[idx], "score": s})
    return results
@click.command("emotion:normalize")
def normalize():
"""normalize the emotion tables."""
DB = connect()
DB.sql("""
CREATE OR REPLACE TABLE emotions AS
SELECT
row_number() over() as id
,e.label
,COUNT(1) AS stories
FROM story_emotions e
JOIN stories s
ON s.id = e.story_id
-- WHERE YEAR(s.published_at) < 2022
GROUP BY e.label
HAVING stories > 1000
ORDER BY stories DESC
""")
DB.sql("""
ALTER TABLE story_emotions
ADD COLUMN emotion_id int64
""")
DB.sql("""
UPDATE story_emotions
SET emotion_id = emotions.id
FROM emotions
WHERE emotions.label = story_emotions.label
""")
DB.sql("""
ALTER TABLE story_emotions
DROP COLUMN label
""")
DB.sql("""
SELECT
row_number() over() as id
,e.label
,COUNT(1) AS stories
FROM story_emotions e
JOIN stories s
ON s.id = e.story_id
-- WHERE YEAR(s.published_at) < 2022
GROUP BY e.label
HAVING stories > 1000
ORDER BY stories DESC
""")
DB.close()
@click.command("emotion:analyze")
def analyze():
"""plot and group emotional labels"""
DB = connect()
DB.sql("""
WITH grouped as (
SELECT
YEAR(s.published_at) as year
,e.label
,COUNT(1) AS stories
FROM story_emotions e
JOIN stories s
ON s.id = e.story_id
WHERE YEAR(s.published_at) < 2022
AND label = 'annoyance'
GROUP BY
YEAR(s.published_at)
,e.label
), total AS (
SELECT
e.label
,count(1) as total
FROM grouped s
JOIN story_emotions e
ON e.label = s.label
GROUP BY
e.label
)
SELECT
g.year
,g.label
,100 * (g.stories / CAST(t.total AS float)) AS frac
FROM grouped g
JOIN total t
ON t.label = g.label
ORDER BY g.label, g.year
""")
DB.close()
sns.lineplot(x=df['year'], y=df['frac'], hue=df['label'])
plt.show()
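extract() and run() above treat GoEmotions as a multi-label problem: the model's logits are squashed with a sigmoid and every label scoring above 0.1 is stored. Below is a minimal standalone sketch of that decoding for a single title, using the same checkpoint as the code above; the example headline is made up.

# sketch of the sigmoid + threshold decoding used by extract()/run()
import numpy as np
import torch
from transformers import BertTokenizer

from model import BertForMultiLabelClassification  # added in this commit

name = "monologg/bert-base-cased-goemotions-original"
tokenizer = BertTokenizer.from_pretrained(name)
model = BertForMultiLabelClassification.from_pretrained(name)
model.eval()

title = "Senate passes bill after months of delay"  # illustrative input
tokens = tokenizer(title, truncation=True, padding="max_length",
                   max_length=92, return_tensors="pt")

with torch.no_grad():
    logits = model(**tokens)[0].numpy()  # forward() returns (logits, ...)

scores = 1 / (1 + np.exp(-logits))        # independent probability per label
for idx, score in enumerate(scores[0]):
    if score > 0.1:                       # same threshold as run()
        print(model.config.id2label[idx], round(float(score), 3))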


@@ -14,3 +14,50 @@ class Model(nn.Module):
         outs = self.bert(**x)
         outs = self.act(self.linear(outs.last_hidden_state))
         return outs
+
+
+import torch.nn as nn
+from transformers import BertPreTrainedModel, BertModel
+
+
+class BertForMultiLabelClassification(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
+        self.loss_fct = nn.BCEWithLogitsLoss()
+
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        if labels is not None:
+            loss = self.loss_fct(logits, labels)
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), logits, (hidden_states), (attentions)
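Because the classifier head added above uses BCEWithLogitsLoss, passing a multi-hot labels tensor makes forward() return (loss, logits, ...), which is the hook a fine-tuning loop would use. A minimal sketch under that assumption; the target vector below is invented purely for illustration.

# sketch: the (loss, logits) contract of BertForMultiLabelClassification
import torch
from transformers import BertTokenizer

from model import BertForMultiLabelClassification

name = "monologg/bert-base-cased-goemotions-original"
tokenizer = BertTokenizer.from_pretrained(name)
model = BertForMultiLabelClassification.from_pretrained(name)

batch = tokenizer(["what a ridiculous headline"], return_tensors="pt")
targets = torch.zeros(1, model.config.num_labels)
targets[0, 0] = 1.0  # pretend the first label applies (illustrative only)

loss, logits = model(**batch, labels=targets)[:2]
loss.backward()                    # gradients flow through the BCE loss
print(loss.item(), logits.shape)   # scalar loss, (1, num_labels) logits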


@@ -31,13 +31,14 @@ def train():
 @click.command(name="word:embed")
 @click.option('-c', '--chunks', type=int, default=5000, show_default=True)
-@click.option('--embedding_dest', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'sequence_embeddings.npy'), show_default=True)
-@click.option('--token_dest', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'sequence_tokens.npy'), show_default=True)
-def embed(chunks, embedding_dest, token_dest):
+@click.option('--embedding_dir', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'embeddings'), show_default=True)
+@click.option('--token_dir', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'tokens'), show_default=True)
+@click.option('--device', help="device to process data on", type=str, default="cuda:0", show_default=True)
+def embed(chunks, embedding_dir, token_dir, device):
     """ given titles, generate tokens and word embeddings and saves to disk """
     # init models
-    device = torch.device('cuda:0')
+    device = torch.device(device)
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     model = RobertaModel.from_pretrained("roberta-base")
     model.to(device)
@@ -56,29 +57,21 @@ def embed(chunks, embedding_dest, token_dest):
     table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
     # generate embeddings from list of titles
-    def get_embeddings(titles):
-        # create tokens, padding to max width
-        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
-        tokens = tokens.to(device)
-        with torch.no_grad():
-            outputs = model(**tokens)
-        return tokens.to(torch.device('cpu')), outputs.last_hidden_state.to(torch.device('cpu'))
-    tokens = []
-    embeddings = []
-    chunks = np.array_split(table['title'].to_numpy(), chunks)
-    chunk_iter = tqdm(chunks, 'embedding')
-    for chunk in chunk_iter:
-        data = chunk.tolist()
-        token, embedding = get_embeddings(data)
-        arr = embedding.detach().numpy()
-        embeddings.append(arr)
-        tokens.append(token)
-    embeddings = np.concatenate(embeddings)
-    tokens = np.concatenate(tokens)
-    np.save(embedding_dest, embeddings)
-    np.save(token_dest, tokens)
+    chunks = np.array_split(table['title'].to_numpy(), chunks)
+    chunk_iter = tqdm(chunks, 'embedding')
+    for i, chunk in enumerate(chunk_iter):
+        # create tokens, padding to max width
+        tokens = tokenizer(chunk.tolist(), add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
+        tokens = tokens.to(device)
+        with torch.no_grad():
+            outputs = model(**tokens)
+
+        # to disk
+        hidden = outputs.last_hidden_state.to(torch.device('cpu')).detach().numpy()
+        np.save(embedding_dir / f"embedding_{i}.npy", hidden)
+
+        tokens = tokens.to(torch.device('cpu'))
+        np.save(token_dir / f"token_{i}.npy", tokens)
 @click.command(name="word:distance")
 def distance():
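The rewritten word:embed no longer concatenates everything into two large arrays; each chunk is written to its own .npy file under --embedding_dir and --token_dir, which keeps peak memory flat. A sketch of how a downstream step might stitch the embedding chunks back together; the directory value below is an assumed default, not taken from this diff.

# sketch: reassemble the per-chunk arrays written by the new word:embed
from pathlib import Path

import numpy as np

embedding_dir = Path("data/embeddings")  # assumed value of --embedding_dir

# embedding_{i}.npy is written once per chunk; sort the files numerically
files = sorted(embedding_dir.glob("embedding_*.npy"),
               key=lambda p: int(p.stem.split("_")[-1]))

embeddings = np.concatenate([np.load(f) for f in files], axis=0)
print(embeddings.shape)  # (num_titles, 92, 768) for roberta-base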