485 lines
14 KiB
Python
485 lines
14 KiB
Python
import click
|
|
from tqdm import tqdm
|
|
import torch
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
from transformers import BertTokenizer
|
|
from model import BertForMultiLabelClassification
|
|
from data.main import connect, data_dir
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
from matplotlib.dates import DateFormatter
|
|
import matplotlib.dates as mdates
|
|
|
|
def data():
    """Return the stories that do not have any emotion rows yet.

    Selects ``id`` and ``title`` from ``stories`` for every id that does not
    appear as a ``story_id`` in ``story_emotions``, ordered by id descending,
    and returns whatever ``.df()`` produces for that query. The connection is
    opened and closed inside this function.
    """
    unlabeled_query = """
        SELECT
            id,
            title
        FROM stories
        WHERE id NOT IN (
            SELECT
                DISTINCT story_id
            FROM story_emotions
        )
        ORDER BY id DESC
    """
    conn = connect()
    relation = conn.sql(unlabeled_query)
    # Materialise the result before the connection goes away.
    frame = relation.df()
    conn.close()
    return frame
|
|
|
|
@click.command("emotion:create-table")
def create_table():
    """create the table to hold the title id and labels.

    (Re)creates ``story_emotions`` with the columns ``story_id``, ``label``
    and ``score``. Any existing table of that name is replaced.
    """
    table = "story_emotions"
    DB = connect()
    try:
        # This must be an f-string: with a plain literal the text "{table}"
        # is sent to the database verbatim instead of the table name.
        # `table` is a local constant, not user input.
        DB.execute(f"""
            CREATE OR REPLACE TABLE {table}
            (
                story_id BIGINT,
                label TEXT,
                score REAL
            )
        """)
    finally:
        # Release the connection even when the statement fails.
        DB.close()
    print(f"\"{table}\" created")
|
|
|
|
@click.command("emotion:extract")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
def extract(chunks):
    """extract emotion class labels from titles and put them in the db

    The unlabeled stories returned by ``data()`` are split into ``chunks``
    parts (a number of parts, not a part size). Each part is tokenized,
    scored with ``run()`` and appended to ``story_emotions``.
    """
    checkpoint = "monologg/bert-base-cased-goemotions-original"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForMultiLabelClassification.from_pretrained(checkpoint)
    model.to(device)

    table = data()
    chunked = np.array_split(table.to_numpy(), chunks)
    for part in tqdm(chunked):
        # array_split pads with empty parts when there are fewer rows than
        # `chunks`; there is nothing to tokenize or insert for those.
        if len(part) == 0:
            continue
        ids = [x[0] for x in part]
        docs = [x[1] for x in part]
        tokens = tokenizer(
            docs,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=92,
            return_attention_mask=True,
            return_tensors="pt",
        )
        tokens = tokens.to(device)
        results = run(model, tokens, ids)
        # No label cleared the threshold: an empty list would build a
        # DataFrame without columns, which the INSERT below cannot consume.
        if not results:
            continue
        # The name `df` is referenced from the SQL text below; keep the two
        # in sync.
        # NOTE(review): `SELECT *` relies on story_emotions still having the
        # (story_id, label, score) layout -- confirm this is only run before
        # `normalize`, which drops `label`.
        df = pd.DataFrame(results)
        DB = connect()
        try:
            DB.execute('INSERT INTO story_emotions SELECT * FROM df')
        finally:
            # Do not leak the connection when an insert fails mid-run.
            DB.close()
|
|
|
|
def run(model, tokens, ids, threshold=0.1):
    """Score one tokenized batch and keep the labels above ``threshold``.

    Args:
        model: callable invoked as ``model(**tokens)``; the first element of
            its return value is used as the logits, and
            ``model.config.id2label`` maps a column index to a label name.
        tokens: mapping of keyword arguments forwarded to ``model``.
        ids: story ids, indexed by the row position in the logits.
        threshold: a label is kept when its sigmoid score is strictly
            greater than this value. Defaults to the previously hard-coded
            ``0.1``.

    Returns:
        A list of ``{"story_id", "label", "score"}`` dicts, ordered by row
        and then by label index.
    """
    with torch.no_grad():
        logits = model(**tokens)[0].to('cpu').detach().numpy()
    scores = 1 / (1 + np.exp(-logits))  # Sigmoid
    id2label = model.config.id2label
    # argwhere walks the matrix in row-major order, i.e. the same order the
    # nested "for row / for label" loops would visit the cells in.
    return [
        {
            "story_id": ids[int(row)],
            "label": id2label[int(col)],
            "score": scores[row, col],
        }
        for row, col in np.argwhere(scores > threshold)
    ]
|
|
|
|
@click.command("emotion:normalize")
def normalize():
    """normalize the emotion tables.

    Builds an ``emotions`` lookup table (id, label, story count) from
    ``story_emotions``, then replaces the ``label`` text column of
    ``story_emotions`` with an ``emotion_id`` column pointing at it.
    """
    # NOTE: not re-runnable as-is. The first statement reads
    # story_emotions.label, which the last statement drops.
    DB = connect()
    try:
        # Lookup table: one row per label attached to more than 1000 stories.
        DB.sql("""
            CREATE OR REPLACE TABLE emotions AS
            SELECT
                row_number() over() as id
                ,e.label
                ,COUNT(1) AS stories
            FROM story_emotions e
            JOIN stories s
                ON s.id = e.story_id
            -- WHERE YEAR(s.published_at) < 2022
            GROUP BY e.label
            HAVING stories > 1000
            ORDER BY stories DESC
        """)
        DB.sql("""
            ALTER TABLE story_emotions
            ADD COLUMN emotion_id int64
        """)
        # Rows whose label did not make it into `emotions` keep a NULL
        # emotion_id.
        DB.sql("""
            UPDATE story_emotions
            SET emotion_id = emotions.id
            FROM emotions
            WHERE emotions.label = story_emotions.label
        """)
        DB.sql("""
            ALTER TABLE story_emotions
            DROP COLUMN label
        """)
        # The previous version re-ran the aggregation over
        # story_emotions.label here. That column no longer exists at this
        # point and the result was discarded, so the statement was removed.
    finally:
        # Release the connection even when one of the statements fails.
        DB.close()
|
|
|
|
@click.command("emotion:analyze")
def coef_over_time():
    """plot and group emotional labels

    For every label in ``emotions`` except ``neutral``, fits a linear
    regression of the label's share of stories per time bucket against time
    and prints the slopes, sorted ascending.
    """
    # NOTE(review): `analyze` in this file is registered under the same
    # "emotion:analyze" command name -- confirm which function should own it.
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    def load_shares(db, buckets='1 month'):
        """Return per-bucket, per-label story shares as a DataFrame."""
        # `buckets` is an internal constant interpolated into the interval
        # literal, not user input.
        return db.sql(f"""
            with cte as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
            )
            select
                epoch(cte.date) / 60 / 60 / 24 / 365 as date
                ,cte.label
                ,cast(cte.stories as float) / t.stories as stories
            from cte
            join total t
                on t.date = cte.date
        """).df()

    def get_coef(shares, label):
        """Fit share ~ date for one label and return ``reg.coef_``."""
        reg = linear_model.LinearRegression()
        df = shares[shares['label'] == label]
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']
        # x and y must be split in ONE call so that every x keeps its own y.
        # Two separate calls shuffle independently and pair unrelated
        # samples.
        x_train, x_test, y_train, y_test = train_test_split(x, y)
        reg.fit(x_train, y_train)
        # y_pred = reg.predict(x_test)
        # sns.lineplot(x=x_test.flatten(), y=y_pred)
        return reg.coef_

    DB = connect()
    try:
        emotions = DB.sql("""
            select label from emotions
        """).df()
        shares = load_shares(DB, '2 year')
    finally:
        # Both frames are materialised, so the connection can go.
        DB.close()

    collection = []
    for emotion in emotions['label']:
        if emotion == 'neutral':
            continue
        coef = get_coef(shares, emotion)[0]
        collection.append({'emotion': emotion, 'coef': coef, 'increasing': coef > 0})

    # The sorted table used to be built and thrown away; print it so the
    # command produces output. No figure is drawn here, so the former bare
    # plt.show() call was dropped.
    print(pd.DataFrame(collection).sort_values('coef'))
|
|
|
|
# NOTE(review): `coef_over_time` in this file is registered under the same
# "emotion:analyze" command name -- confirm which function should own it.
@click.command("emotion:analyze")
def analyze():
    """plot and group emotional labels

    Work-in-progress: as written this function references names and columns
    that the visible code never defines (see the NOTE(review) comments
    below), so it is not expected to run to completion.
    """
    DB = connect()

    # NOTE(review): `emotions` is loaded but never used in this function.
    emotions = DB.sql("""
        select label from emotions
    """).df()

    from sklearn import linear_model
    from sklearn.model_selection import train_test_split
    # NOTE(review): `get_coef` is defined but never called in this function,
    # and its `emotion` argument is ignored -- the query is hard-wired to
    # the 'sadness' label.
    def get_coef(emotion):
        # Monthly story counts for the label, divided by the label's overall
        # story count from `emotions` (the `total` CTE is declared but not
        # joined; the join on it is commented out).
        df = DB.sql("""
            with cte as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                --AND e.label in ('neutral', 'annoyance')
                AND e.label in ('sadness')
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
            )
            select
                epoch(cte.date) as date
                ,cte.label
                --,total.stories as total
                ,cast(cte.stories as float) / e.stories as stories
            from cte
            join emotions e
                --on total.date = cte.date
                on e.label = cte.label
        """).df()

        reg = linear_model.LinearRegression()
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']

        # NOTE(review): x and y are split by two separate calls, so each is
        # shuffled independently and x_train/y_train are not guaranteed to
        # line up -- presumably this should be a single
        # train_test_split(x, y) call; confirm.
        x_train, x_test = train_test_split(x)
        y_train, y_test = train_test_split(y)
        reg.fit(x_train, y_train)
        #y_pred = reg.predict(x_test)
        return reg.coef_


    # NOTE(review): `yearly` is not defined anywhere in the visible source;
    # unless it is a module-level name outside this view, this line raises
    # NameError.
    df = DB.sql(f"""{yearly}""").df()
    df['date'] = pd.to_datetime(df['date'])
    ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
    #ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
    plt.locator_params(axis='y', nbins=6)
    ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
    plt.show()

    # NOTE(review): the `grouped` CTE has an empty body, and the query joins
    # on story_emotions.label; the result is also discarded (not assigned,
    # no .df()). Looks like an unfinished draft -- confirm intent.
    DB.sql("""
        WITH grouped as (
        ), total AS (
            SELECT
                e.label
                ,count(1) as total
            FROM grouped s
            JOIN story_emotions e
                ON e.label = s.label
            GROUP BY
                e.label
        )
        SELECT
            g.year
            ,g.label
            ,100 * (g.stories / CAST(t.total AS float)) AS frac
        FROM grouped g
        JOIN total t
            ON t.label = g.label
        ORDER BY g.label, g.year
    """)
    DB.close()

    # NOTE(review): `df` here is still the frame built from `yearly` above;
    # the 'year' and 'frac' columns are only produced by the discarded query
    # -- presumably that query's result was meant to be plotted; confirm.
    sns.lineplot(x=df['year'], y=df['frac'], hue=df['label'])
    plt.show()
|
|
|
|
def debug():
    """Label every story title with a text-classification pipeline.

    Runs the ``j-hartmann/emotion-english-distilroberta-base`` pipeline over
    all story titles, rebuilds ``story_emotions`` and ``emotions`` from the
    predictions, swaps the ``label`` column of ``story_emotions`` for an
    ``emotion_id`` column, and writes the raw predictions to
    ``emotions.csv`` in ``data_dir()``.
    """
    from transformers import pipeline

    # load data
    DB = connect()
    try:
        table = DB.sql("""
            SELECT
                id,
                title
            FROM stories
            ORDER BY id DESC
        """).df()
    finally:
        DB.close()

    classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

    chunks = 5000
    labels = []
    ids = []
    # Split by row position rather than handing the DataFrame itself to
    # numpy; part sizes are the same as np.array_split(table, chunks).
    for positions in tqdm(np.array_split(np.arange(len(table)), chunks)):
        # Fewer rows than `chunks` yields empty parts; nothing to classify.
        if len(positions) == 0:
            continue
        chunk = table.iloc[positions]
        with torch.no_grad():
            emotions = classifier(chunk['title'].tolist())
        # Accumulate flat lists directly instead of nesting per-chunk lists
        # and flattening them through np.concatenate afterwards.
        labels.extend(emotions)
        ids.extend(chunk['id'].tolist())

    out_ids = pd.DataFrame(ids, columns=['story_id'])
    # The name `out` is referenced from the SQL text below; keep the two in
    # sync.
    out = pd.concat([out_ids, pd.DataFrame(labels)], axis=1)

    DB = connect()
    try:
        DB.sql("""
            CREATE OR REPLACE TABLE story_emotions AS
            SELECT
                story_id
                ,label
                ,score
            FROM out
        """)
        DB.sql("""
            CREATE OR REPLACE TABLE emotions AS
            SELECT
                row_number() over() as id
                ,label
                ,count(1) as stories
            FROM story_emotions
            GROUP BY
                label
        """)
        DB.sql("""
            ALTER TABLE story_emotions add emotion_id bigint
        """)
        DB.sql("""
            UPDATE story_emotions
            SET emotion_id = emotions.id
            FROM emotions
            WHERE story_emotions.label = emotions.label
        """)
        DB.sql("""
            ALTER TABLE story_emotions drop column label
        """)
        # Two trailing "select * ..." statements were removed: their results
        # were never assigned, printed or fetched.
    finally:
        # Release the connection even when one of the statements fails.
        DB.close()

    out.to_csv(data_dir() / 'emotions.csv', sep="|")
|
|
|
|
def another():
    # Exploratory scratch code: yearly emotion counts, per-emotion linear
    # trends, and emotion-by-publisher-bias plots.
    # NOTE(review): several statements below cannot work as written (use of
    # DB after close, undefined names, a missing column key); they are
    # flagged individually.
    DB = connect()

    # NOTE(review): the results of the next two queries are discarded.
    DB.sql("""
        select
            *
        from emotions
    """)

    DB.sql("""
        select
            *
        from story_emotions
    """)

    # Story count per (publication year, emotion label).
    emotions = DB.sql("""
        SELECT
            YEAR(s.published_at) AS year
            ,e.label AS emotion
            ,count(1) AS stories
        FROM stories s
        JOIN story_emotions se
            ON s.id = se.story_id
        JOIN emotions e
            ON e.id = se.emotion_id
        GROUP by
            YEAR(s.published_at)
            ,e.label
    """).df()
    # Bare expression with no effect in a script.
    emotions

    sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
    plt.show()

    # One row per year, one column per emotion.
    pivot = emotions.pivot(index='year', columns='emotion', values='stories')
    pivot.reset_index(inplace=True)
    from sklearn.linear_model import LinearRegression
    reg = LinearRegression()

    # keys()[1:] skips the first column, which is 'year' after reset_index.
    for emotion in pivot.keys()[1:].tolist():
        _ = reg.fit(pivot['year'].to_numpy().reshape(-1, 1), pivot[emotion])
        print(f"{emotion}: {reg.coef_[0]}")

    fig, ax = plt.subplots()
    #sns.lineplot(x=pivot['anger'], y=pivot['joy'])
    #sns.lineplot(x=pivot['anger'], y=pivot['surprise'], ax=ax)
    sns.lineplot(x=pivot['anger'], y=pivot['fear'], ax=ax)
    # NOTE(review): pivot[''] uses an empty column name -- presumably a
    # placeholder for an emotion that was never filled in; confirm.
    sns.lineplot(x=pivot[''], y=pivot['fear'], ax=ax)
    plt.show()

    DB.close()

    # NOTE(review): DB is used again below although it was closed on the
    # line above. The query also reads se.label from story_emotions, while
    # the query earlier in this function joins through se.emotion_id --
    # confirm which schema is expected here.
    normalized = DB.sql("""
        with cte as (
            select
                year(s.published_at) as year
                ,se.label as emotion
                ,b.label as bias
            from stories s
            join story_emotions se
                on s.id = se.story_id
            join publisher_bias b
                on b.id = s.publisher_id
            where b.label != 'allsides'
            and se.label != 'neutral'
        )
        select
            distinct
            year
            ,emotion
            ,bias
            ,cast(count(1) over(partition by year, bias, emotion) as float) / count(1) over(partition by year, bias) as group_count
        from cte
    """).df()

    # NOTE(review): the resulting frame is not assigned to anything.
    DB.sql("""
        select
            b.label as bias
            ,count(1) as stories
        from stories s
        join story_emotions se
            on s.id = se.story_id
        join publisher_bias b
            on b.id = s.publisher_id
        group by
            b.label
    """).df()

    # NOTE(review): `emotional_bias` is not defined anywhere in the visible
    # source.
    another_pivot = emotional_bias.pivot(index=['bias', 'year'], columns='emotion', values='stories')
    another_pivot.reset_index(inplace=True)

    sns.lineplot(data=normalized, x='year', y='group_count', hue='bias', style='emotion')
    plt.show()

    sns.relplot(
        data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line"
        #data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line", facet_kws=dict(sharey=False)
    )
    plt.show()

    # The SQL refers to the local `another_pivot` frame by name; the result
    # is discarded.
    DB.sql("""
        select
            *
        from another_pivot
    """)
|