wwu-577/src/emotion.py

485 lines
14 KiB
Python

import click
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from model import BertForMultiLabelClassification
from data.main import connect, data_dir
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
def data():
    """Load the stories that do not have emotion rows yet.

    Runs a query that selects ``id`` and ``title`` from ``stories`` for every
    id that does not appear as a ``story_id`` in ``story_emotions``, ordered
    by descending id, and returns the result of calling ``.df()`` on it.
    The connection obtained from ``connect()`` is closed before returning.
    """
    pending_query = """
SELECT
id,
title
FROM stories
WHERE id NOT IN (
SELECT
DISTINCT story_id
FROM story_emotions
)
ORDER BY id DESC
"""
    connection = connect()
    pending = connection.sql(pending_query).df()
    connection.close()
    return pending
@click.command("emotion:create-table")
def create_table():
    """create the table to hold the title id and labels."""
    # Creates (or replaces) the table with three columns: story_id BIGINT,
    # label TEXT and score REAL, then prints a confirmation.
    table = "story_emotions"
    DB = connect()
    try:
        # The statement must be an f-string: without the prefix the literal
        # text "{table}" is sent to the database instead of the table name.
        DB.execute(f"""
CREATE OR REPLACE TABLE {table}
(
story_id BIGINT,
label TEXT,
score REAL
)
""")
    finally:
        # Release the connection even when the statement fails.
        DB.close()
    print(f"\"{table}\" created")
@click.command("emotion:extract")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
def extract(chunks):
    """extract emotion class labels from titles and put them in the db"""
    # `chunks` is handed to np.array_split as the NUMBER of parts the pending
    # stories are divided into; it is not a rows-per-part size.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
    model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")
    model.to(device)
    table = data()
    chunked = np.array_split(table.to_numpy(), chunks)
    for part in tqdm(chunked):
        # np.array_split yields empty parts when there are fewer rows than
        # requested parts; there is nothing to tokenize or insert for those.
        if len(part) == 0:
            continue
        # Each row of `part` is (id, title), matching the column order
        # selected by data().
        ids = [x[0] for x in part]
        docs = [x[1] for x in part]
        tokens = tokenizer(docs, add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
        tokens = tokens.to(device)
        results = run(model, tokens, ids)
        # An empty list would build a DataFrame without columns, which
        # cannot feed the three-column INSERT below.
        if not results:
            continue
        # The INSERT statement refers to this local by its name ("FROM df"),
        # so the variable must keep this name.
        df = pd.DataFrame(results)
        DB = connect()
        try:
            DB.execute('INSERT INTO story_emotions SELECT * FROM df')
        finally:
            # Close the per-part connection even if the insert fails.
            DB.close()
def run(model, tokens, ids, threshold=0.1):
    """Score one batch and return the labels whose score passes a threshold.

    Args:
        model: callable invoked as ``model(**tokens)``; element ``[0]`` of
            its result is treated as the logits tensor, and
            ``model.config.id2label`` maps a column index to a label name.
        tokens: mapping of keyword arguments forwarded to ``model``.
        ids: story ids, indexed by the row position of the logits.
        threshold: a (row, label) pair is kept only when its sigmoid score
            is strictly greater than this value. Defaults to 0.1.

    Returns:
        A list of ``{"story_id", "label", "score"}`` dicts, in row order and
        then label-index order.
    """
    with torch.no_grad():
        logits = model(**tokens)[0].to('cpu').detach().numpy()
    scores = 1 / (1 + np.exp(-logits))  # Sigmoid
    id2label = model.config.id2label
    return [
        {"story_id": ids[row], "label": id2label[col], "score": score}
        for row, row_scores in enumerate(scores)
        for col, score in enumerate(row_scores)
        if score > threshold
    ]
@click.command("emotion:normalize")
def normalize():
    """normalize the emotion tables."""
    # Steps, in order:
    #   1. build the `emotions` lookup table (id, label, stories) from the
    #      labels that occur on more than 1000 joined stories;
    #   2. add an `emotion_id` column to `story_emotions`;
    #   3. fill it by matching labels against `emotions`;
    #   4. drop the now-redundant `label` column from `story_emotions`.
    # Rows whose label did not make it into `emotions` are not matched by
    # the UPDATE, so their emotion_id is left unset.
    # NOTE(review): step 2 is a plain ADD COLUMN, so a second run against an
    # already-normalized table presumably fails - confirm before re-running.
    DB = connect()
    try:
        DB.sql("""
CREATE OR REPLACE TABLE emotions AS
SELECT
row_number() over() as id
,e.label
,COUNT(1) AS stories
FROM story_emotions e
JOIN stories s
ON s.id = e.story_id
-- WHERE YEAR(s.published_at) < 2022
GROUP BY e.label
HAVING stories > 1000
ORDER BY stories DESC
""")
        DB.sql("""
ALTER TABLE story_emotions
ADD COLUMN emotion_id int64
""")
        DB.sql("""
UPDATE story_emotions
SET emotion_id = emotions.id
FROM emotions
WHERE emotions.label = story_emotions.label
""")
        DB.sql("""
ALTER TABLE story_emotions
DROP COLUMN label
""")
        # The former trailing SELECT re-read `label` from story_emotions
        # right after the column was dropped and discarded its result, so
        # it has been removed.
    finally:
        # Release the connection even when one of the statements fails.
        DB.close()
@click.command("emotion:analyze")
def coef_over_time():
    """plot and group emotional labels"""
    # For every label in `emotions` except 'neutral', fit a linear regression
    # of that label's share of stories per time bucket against time, then
    # print the slopes sorted ascending.
    # NOTE(review): analyze() in this file registers the same command name
    # ("emotion:analyze"); confirm which of the two is meant to be exposed.
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    DB = connect()

    def bucketed_shares(buckets = '1 month'):
        # One row per (bucket, label): `stories` is the label's story count
        # divided by the total story count of the same bucket, and `date` is
        # the bucket's epoch seconds divided by 60*60*24*365.
        return DB.sql(f"""
with cte as (
SELECT
time_bucket(interval '{buckets}', s.published_at) as date
,e.label
,COUNT(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '{buckets}', s.published_at)
,e.label
)
,total as (
SELECT
time_bucket(interval '{buckets}', s.published_at) as date
,COUNT(1) AS stories
FROM stories s
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '{buckets}', s.published_at)
)
select
epoch(cte.date) / 60 / 60 / 24 / 365 as date
,cte.label
,cast(cte.stories as float) / t.stories as stories
from cte
join total t
on t.date = cte.date
""").df()

    try:
        emotions = DB.sql("""
select label from emotions
""").df()
        shares = bucketed_shares('2 year')
    finally:
        # Everything needed is in memory now; release the connection.
        DB.close()

    def get_coef(label):
        reg = linear_model.LinearRegression()
        df = shares[shares['label'] == label]
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']
        # Split x and y in ONE call so every date stays paired with its own
        # value; two separate calls select rows independently of each other.
        x_train, _, y_train, _ = train_test_split(x, y)
        reg.fit(x_train, y_train)
        return reg.coef_

    collection = []
    for emotion in emotions['label']:
        if emotion == 'neutral':
            continue
        coef = get_coef(emotion)[0]
        collection.append({'emotion' : emotion, 'coef' : coef, 'increasing' : coef > 0 })
    # Explicit columns keep sort_values working when no label was processed.
    summary = pd.DataFrame(collection, columns=['emotion', 'coef', 'increasing'])
    print(summary.sort_values('coef'))
@click.command("emotion:analyze")
def analyze():
    """plot and group emotional labels"""
    # NOTE(review): this command cannot run as written - see the notes on
    # the individual statements below. coef_over_time() in this file also
    # registers the command name "emotion:analyze".
    DB = connect()
    # NOTE(review): `emotions` is not read again inside this function.
    emotions = DB.sql("""
select label from emotions
""").df()
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split
    # NOTE(review): get_coef is never called in this function. It ignores its
    # `emotion` argument (the query hard-codes the label 'sadness') and the
    # `total` CTE it defines is not referenced by the final select.
    def get_coef(emotion):
        df = DB.sql("""
with cte as (
SELECT
time_bucket(interval '1 month', s.published_at) as date
,e.label
,COUNT(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE YEAR(s.published_at) < 2022
--AND e.label in ('neutral', 'annoyance')
AND e.label in ('sadness')
GROUP BY
time_bucket(interval '1 month', s.published_at)
,e.label
)
,total as (
SELECT
time_bucket(interval '1 month', s.published_at) as date
,COUNT(1) AS stories
FROM stories s
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '1 month', s.published_at)
)
select
epoch(cte.date) as date
,cte.label
--,total.stories as total
,cast(cte.stories as float) / e.stories as stories
from cte
join emotions e
--on total.date = cte.date
on e.label = cte.label
""").df()
        reg = linear_model.LinearRegression()
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']
        # NOTE(review): x and y are split by two separate calls, so the
        # training rows of x presumably do not line up with those of y -
        # confirm and split both in a single call.
        x_train, x_test = train_test_split(x)
        y_train, y_test = train_test_split(y)
        reg.fit(x_train, y_train)
        #y_pred = reg.predict(x_test)
        return reg.coef_
    # NOTE(review): `yearly` is not defined anywhere in the visible part of
    # this file, so this line presumably raises NameError.
    df = DB.sql(f"""{yearly}""").df()
    df['date'] = pd.to_datetime(df['date'])
    # Line plot of `stories` over `date`, one line per label.
    ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
    #ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
    plt.locator_params(axis='y', nbins=6)
    ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
    plt.show()
    # NOTE(review): the `grouped` CTE below has an empty body, and the result
    # of this statement is not assigned to anything.
    DB.sql("""
WITH grouped as (
), total AS (
SELECT
e.label
,count(1) as total
FROM grouped s
JOIN story_emotions e
ON e.label = s.label
GROUP BY
e.label
)
SELECT
g.year
,g.label
,100 * (g.stories / CAST(t.total AS float)) AS frac
FROM grouped g
JOIN total t
ON t.label = g.label
ORDER BY g.label, g.year
""")
    DB.close()
    # NOTE(review): `df` is still the frame plotted above (date / stories /
    # label); `year` and `frac` are the column names of the discarded query,
    # so this plot presumably expected that query's result instead.
    sns.lineplot(x=df['year'], y=df['frac'], hue=df['label'])
    plt.show()
def debug():
    """Label every story title with a text-classification pipeline.

    Reads ``id`` and ``title`` of all stories, classifies the titles in 5000
    parts, rebuilds ``story_emotions`` and ``emotions`` from the predictions,
    replaces the ``label`` column of ``story_emotions`` with ``emotion_id``,
    and finally writes the raw predictions to ``emotions.csv`` (``|``
    separated) under ``data_dir()``.
    """
    from transformers import pipeline

    # load data
    source = connect()
    stories = source.sql("""
SELECT
id,
title
FROM stories
ORDER BY id DESC
""").df()
    source.close()

    classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")
    part_count = 5000
    predictions = []
    story_ids = []
    for batch in tqdm(np.array_split(stories, part_count)):
        titles = batch['title'].tolist()
        batch_ids = batch['id'].tolist()
        with torch.no_grad():
            scored = classifier(titles)
        predictions.append(scored)
        story_ids.append(batch_ids)

    prediction_frame = pd.DataFrame(np.concatenate(predictions).tolist())
    id_frame = pd.DataFrame(np.concatenate(story_ids).tolist(), columns=['story_id'])
    # The first statement below reads this frame by name ("FROM out"), so the
    # local must keep the name `out`.
    out = pd.concat([id_frame, prediction_frame], axis=1)

    statements = (
        """
CREATE OR REPLACE TABLE story_emotions AS
SELECT
story_id
,label
,score
FROM out
""",
        """
CREATE OR REPLACE TABLE emotions AS
SELECT
row_number() over() as id
,label
,count(1) as stories
FROM story_emotions
GROUP BY
label
""",
        """
ALTER TABLE story_emotions add emotion_id bigint
""",
        """
UPDATE story_emotions
SET emotion_id = emotions.id
FROM emotions
WHERE story_emotions.label = emotions.label
""",
        """
ALTER TABLE story_emotions drop column label
""",
        """
select
*
from emotions
""",
        """
select
* from story_emotions
limit 4
""",
    )
    DB = connect()
    # Plain loop in this function's own frame, so `out` stays visible to the
    # statement that refers to it. Results of the selects are not kept.
    for statement in statements:
        DB.sql(statement)
    DB.close()

    out.to_csv(data_dir() / 'emotions.csv', sep="|")
def another():
    # Exploratory plots of emotion counts per year and per publisher bias.
    # NOTE(review): this reads like scratch/notebook code and cannot run to
    # completion as written - see the notes on the individual statements.
    DB = connect()
    # The results of these two selects are not assigned or used.
    DB.sql("""
select
*
from emotions
""")
    DB.sql("""
select
*
from story_emotions
""")
    # Story count per (year, emotion label).
    emotions = DB.sql("""
SELECT
YEAR(s.published_at) AS year
,e.label AS emotion
,count(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
GROUP by
YEAR(s.published_at)
,e.label
""").df()
    # Bare expression; it has no effect inside a function.
    emotions
    sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
    plt.show()
    # One row per year, one column per emotion label.
    pivot = emotions.pivot(index='year', columns='emotion', values='stories')
    pivot.reset_index(inplace=True)
    from sklearn.linear_model import LinearRegression
    reg = LinearRegression()
    # Refit the same regressor per emotion column (skipping the leading
    # `year` column) and print the slope of stories over year.
    for emotion in pivot.keys()[1:].tolist():
        _ = reg.fit(pivot['year'].to_numpy().reshape(-1, 1), pivot[emotion])
        print(f"{emotion}: {reg.coef_[0]}")
    fig, ax = plt.subplots()
    #sns.lineplot(x=pivot['anger'], y=pivot['joy'])
    #sns.lineplot(x=pivot['anger'], y=pivot['surprise'], ax=ax)
    sns.lineplot(x=pivot['anger'], y=pivot['fear'], ax=ax)
    # NOTE(review): pivot[''] only exists if some emotion label is the empty
    # string; otherwise this lookup presumably raises KeyError.
    sns.lineplot(x=pivot[''], y=pivot['fear'], ax=ax)
    plt.show()
    DB.close()
    # NOTE(review): DB was closed on the line above, yet it is used for all
    # of the remaining queries. This query also reads se.label, a column
    # that normalize() and debug() in this file drop from story_emotions -
    # confirm which table state it expects.
    normalized = DB.sql("""
with cte as (
select
year(s.published_at) as year
,se.label as emotion
,b.label as bias
from stories s
join story_emotions se
on s.id = se.story_id
join publisher_bias b
on b.id = s.publisher_id
where b.label != 'allsides'
and se.label != 'neutral'
)
select
distinct
year
,emotion
,bias
,cast(count(1) over(partition by year, bias, emotion) as float) / count(1) over(partition by year, bias) as group_count
from cte
""").df()
    # Story count per bias label; the resulting frame is not assigned.
    DB.sql("""
select
b.label as bias
,count(1) as stories
from stories s
join story_emotions se
on s.id = se.story_id
join publisher_bias b
on b.id = s.publisher_id
group by
b.label
""").df()
    # NOTE(review): `emotional_bias` is not defined anywhere in the visible
    # part of this file, so this line presumably raises NameError.
    another_pivot = emotional_bias.pivot(index=['bias', 'year'], columns='emotion', values='stories')
    another_pivot.reset_index(inplace=True)
    sns.lineplot(data=normalized, x='year', y='group_count', hue='bias', style='emotion')
    plt.show()
    # One panel per bias label, one line per emotion.
    sns.relplot(
        data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line"
        #data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line", facet_kws=dict(sharey=False)
    )
    plt.show()
    # Refers to the local `another_pivot` frame by name inside the SQL text;
    # the result is not used.
    DB.sql("""
select
*
from another_pivot
""")