import click
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer

from model import BertForMultiLabelClassification
from data.main import connect, data_dir

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates


def data():
    """load stories that do not have emotion labels yet."""
    DB = connect()
    table = DB.sql("""
        SELECT id, title
        FROM stories
        WHERE id NOT IN (
            SELECT DISTINCT story_id
            FROM story_emotions
        )
        ORDER BY id DESC
    """).df()
    DB.close()
    return table


@click.command("emotion:create-table")
def create_table():
    """create the table to hold the title id and labels."""
    DB = connect()
    table = "story_emotions"
    # f-string so {table} is interpolated into the DDL
    DB.execute(f"""
        CREATE OR REPLACE TABLE {table} (
            story_id BIGINT,
            label TEXT,
            score REAL
        )
    """)
    DB.close()
    print(f"\"{table}\" created")


@click.command("emotion:extract")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
def extract(chunks):
    """extract emotion class labels from titles and put them in the db"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
    model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")
    model.to(device)

    table = data()
    chunked = np.array_split(table.to_numpy(), chunks)
    for part in tqdm(chunked):
        ids = [x[0] for x in part]
        docs = [x[1] for x in part]
        tokens = tokenizer(docs,
                           add_special_tokens=True,
                           truncation=True,
                           padding="max_length",
                           max_length=92,
                           return_attention_mask=True,
                           return_tensors="pt")
        tokens = tokens.to(device)
        results = run(model, tokens, ids)
        df = pd.DataFrame(results)
        DB = connect()
        DB.execute('INSERT INTO story_emotions SELECT * FROM df')
        DB.close()


def run(model, tokens, ids):
    """score one tokenized batch and keep every label above the threshold."""
    threshold = 0.1
    with torch.no_grad():
        outputs = model(**tokens)[0].to('cpu').detach().numpy()
    scores = 1 / (1 + np.exp(-outputs))  # sigmoid: independent per-label probabilities
    results = []
    for i, item in enumerate(scores):
        for idx, s in enumerate(item):
            if s > threshold:
                results.append({
                    "story_id": ids[i],
                    "label": model.config.id2label[idx],
                    "score": s,
                })
    return results


@click.command("emotion:normalize")
def normalize():
    """normalize the emotion tables."""
    DB = connect()
    DB.sql("""
        CREATE OR REPLACE TABLE emotions AS
        SELECT
            row_number() over() as id
            ,e.label
            ,COUNT(1) AS stories
        FROM story_emotions e
        JOIN stories s
            ON s.id = e.story_id
        -- WHERE YEAR(s.published_at) < 2022
        GROUP BY e.label
        HAVING stories > 1000
        ORDER BY stories DESC
    """)
    DB.sql("""
        ALTER TABLE story_emotions
        ADD COLUMN emotion_id int64
    """)
    DB.sql("""
        UPDATE story_emotions
        SET emotion_id = emotions.id
        FROM emotions
        WHERE emotions.label = story_emotions.label
    """)
    DB.sql("""
        ALTER TABLE story_emotions
        DROP COLUMN label
    """)
    DB.close()
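

# Why run() uses a sigmoid rather than a softmax: GoEmotions is a multi-label
# task, so every label gets an independent probability and one title can carry
# several emotions at once. A minimal, self-contained sketch of the same
# thresholding rule -- the logits and label names below are made up for
# illustration, not real model output:
def _sigmoid_threshold_demo(threshold=0.1):
    id2label = {0: "joy", 1: "sadness", 2: "neutral"}  # hypothetical mapping
    logits = np.array([2.2, -1.5, -2.8])               # fabricated raw outputs for one title
    scores = 1 / (1 + np.exp(-logits))                 # ~[0.90, 0.18, 0.06]
    # keep every label whose independent probability clears the threshold;
    # a softmax would force the scores to sum to 1 and crowd out weak labels
    return {id2label[i]: float(s) for i, s in enumerate(scores) if s > threshold}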


@click.command("emotion:coef-over-time")
def coef_over_time():
    """fit a trend line to each emotion's share of stories over time."""
    DB = connect()
    emotions = DB.sql("""
        select label from emotions
    """).df()
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    def results(buckets='1 month'):
        """fraction of stories per emotion per time bucket, pre-2022."""
        results = DB.sql(f"""
            with cte as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY time_bucket(interval '{buckets}', s.published_at)
            )
            select
                epoch(cte.date) / 60 / 60 / 24 / 365 as date  -- seconds -> years
                ,cte.label
                ,cast(cte.stories as float) / t.stories as stories
            from cte
            join total t
                on t.date = cte.date
        """).df()
        return results

    def get_coef(label):
        """slope of a linear fit through one emotion's share over time."""
        reg = linear_model.LinearRegression()
        df = results[results['label'] == label]
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']
        # split x and y together so the train/test pairs stay aligned
        x_train, x_test, y_train, y_test = train_test_split(x, y)
        reg.fit(x_train, y_train)
        # y_pred = reg.predict(x_test)
        # sns.lineplot(x=x_test.flatten(), y=y_pred)
        return reg.coef_

    collection = []
    results = results('2 year')  # rebind: the name now holds the DataFrame read by get_coef
    for emotion in emotions['label']:
        if emotion == 'neutral':
            continue
        coef = get_coef(emotion)[0]
        collection.append({
            'emotion': emotion,
            'coef': coef,
            'increasing': coef > 0,
        })
    pd.DataFrame(collection).sort_values('coef')
    plt.show()


@click.command("emotion:analyze")
def analyze():
    """plot and group emotional labels"""
    DB = connect()
    emotions = DB.sql("""
        select label from emotions
    """).df()
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    def get_coef(emotion):
        df = DB.sql("""
            with cte as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                --AND e.label in ('neutral', 'annoyance')
                AND e.label in ('sadness')
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY time_bucket(interval '1 month', s.published_at)
            )
            select
                epoch(cte.date) as date
                ,cte.label
                --,total.stories as total
                ,cast(cte.stories as float) / e.stories as stories
            from cte
            join emotions e
                --on total.date = cte.date
                on e.label = cte.label
        """).df()
        reg = linear_model.LinearRegression()
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']
        # split x and y together so the train/test pairs stay aligned
        x_train, x_test, y_train, y_test = train_test_split(x, y)
        reg.fit(x_train, y_train)
        # y_pred = reg.predict(x_test)
        return reg.coef_

    # `yearly` is not defined in this module; the query it names must be
    # supplied elsewhere before this command can run.
    df = DB.sql(f"""{yearly}""").df()
    df['date'] = pd.to_datetime(df['date'])
    ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
    # ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
    plt.locator_params(axis='y', nbins=6)
    ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
    plt.show()

    DB.sql("""
        WITH grouped as (
            -- body missing: grouped must supply year, label, and stories
            -- for the select below
        ), total AS (
            SELECT
                e.label
                ,count(1) as total
            FROM grouped s
            JOIN story_emotions e
                ON e.label = s.label
            GROUP BY e.label
        )
        SELECT
            g.year
            ,g.label
            ,100 * (g.stories / CAST(t.total AS float)) AS frac
        FROM grouped g
        JOIN total t
            ON t.label = g.label
        ORDER BY g.label, g.year
    """)
    DB.close()
    sns.lineplot(x=df['year'], y=df['frac'], hue=df['label'])
    plt.show()
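

# What get_coef() estimates: a least-squares line through one emotion's share
# of stories over time, with the slope's sign read as rising or falling.
# A minimal sketch on fabricated data (the series below is synthetic; the real
# input comes from the time_bucket queries above):
def _trend_demo():
    from sklearn.linear_model import LinearRegression
    years = np.arange(2010, 2022).reshape(-1, 1)  # one observation per year
    share = 0.10 + 0.02 * (years.ravel() - 2010)  # fabricated upward trend
    reg = LinearRegression().fit(years, share)
    return reg.coef_[0] > 0  # True here: this emotion's share is increasing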


def debug():
    """alternative single-label pipeline, kept for comparison and debugging."""
    from transformers import pipeline

    # load data
    DB = connect()
    table = DB.sql("""
        SELECT id, title
        FROM stories
        ORDER BY id DESC
    """).df()
    DB.close()

    classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")
    chunks = 5000
    chunked = np.array_split(table, chunks)
    labels = []
    ids = []
    for chunk in tqdm(chunked):
        sentences = chunk['title'].tolist()
        label_ids = chunk['id'].tolist()
        with torch.no_grad():
            emotions = classifier(sentences)
        labels.append(emotions)
        ids.append(label_ids)
    out = pd.DataFrame(np.concatenate(labels).tolist())
    out_ids = pd.DataFrame(np.concatenate(ids).tolist(), columns=['story_id'])
    out = pd.concat([out_ids, out], axis=1)

    DB = connect()
    DB.sql("""
        CREATE OR REPLACE TABLE story_emotions AS
        SELECT
            story_id
            ,label
            ,score
        FROM out
    """)
    DB.sql("""
        CREATE OR REPLACE TABLE emotions AS
        SELECT
            row_number() over() as id
            ,label
            ,count(1) as stories
        FROM story_emotions
        GROUP BY label
    """)
    DB.sql("""
        ALTER TABLE story_emotions
        ADD emotion_id bigint
    """)
    DB.sql("""
        UPDATE story_emotions
        SET emotion_id = emotions.id
        FROM emotions
        WHERE story_emotions.label = emotions.label
    """)
    DB.sql("""
        ALTER TABLE story_emotions
        DROP COLUMN label
    """)
    DB.sql("""
        select * from emotions
    """)
    DB.sql("""
        select * from story_emotions
        limit 4
    """)
    DB.close()
    out.to_csv(data_dir() / 'emotions.csv', sep="|")


def another():
    DB = connect()
    DB.sql("""
        select * from emotions
    """)
    DB.sql("""
        select * from story_emotions
    """)
    emotions = DB.sql("""
        SELECT
            YEAR(s.published_at) AS year
            ,e.label AS emotion
            ,count(1) AS stories
        FROM stories s
        JOIN story_emotions se
            ON s.id = se.story_id
        JOIN emotions e
            ON e.id = se.emotion_id
        GROUP by
            YEAR(s.published_at)
            ,e.label
    """).df()

    sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
    plt.show()

    pivot = emotions.pivot(index='year', columns='emotion', values='stories')
    pivot.reset_index(inplace=True)

    from sklearn.linear_model import LinearRegression
    reg = LinearRegression()
    for emotion in pivot.keys()[1:].tolist():
        _ = reg.fit(pivot['year'].to_numpy().reshape(-1, 1), pivot[emotion])
        print(f"{emotion}: {reg.coef_[0]}")

    fig, ax = plt.subplots()
    # sns.lineplot(x=pivot['anger'], y=pivot['joy'])
    # sns.lineplot(x=pivot['anger'], y=pivot['surprise'], ax=ax)
    sns.lineplot(x=pivot['anger'], y=pivot['fear'], ax=ax)
    # sns.lineplot(x=pivot[''], y=pivot['fear'], ax=ax)  # column name missing
    plt.show()

    normalized = DB.sql("""
        with cte as (
            select
                year(s.published_at) as year
                ,se.label as emotion
                ,b.label as bias
            from stories s
            join story_emotions se
                on s.id = se.story_id
            join publisher_bias b
                on b.id = s.publisher_id
            where b.label != 'allsides'
            and se.label != 'neutral'
        )
        select distinct
            year
            ,emotion
            ,bias
            ,cast(count(1) over(partition by year, bias, emotion) as float)
                / count(1) over(partition by year, bias) as group_count
        from cte
    """).df()
    DB.sql("""
        select
            b.label as bias
            ,count(1) as stories
        from stories s
        join story_emotions se
            on s.id = se.story_id
        join publisher_bias b
            on b.id = s.publisher_id
        group by b.label
    """).df()

    # `emotional_bias` is not defined in this module
    another_pivot = emotional_bias.pivot(index=['bias', 'year'], columns='emotion', values='stories')
    another_pivot.reset_index(inplace=True)

    sns.lineplot(data=normalized, x='year', y='group_count', hue='bias', style='emotion')
    plt.show()

    sns.relplot(
        data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line"
        # facet_kws=dict(sharey=False)
    )
    plt.show()

    DB.sql("""
        select * from another_pivot
    """)
    # close only after every query above has run
    DB.close()
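

# How these commands are meant to be invoked: each decorated function above is
# a standalone click command. A sketch of one possible wiring into a CLI group
# -- the group name `cli` and the module path `emotion` are assumptions, not
# part of this module:
#
#   import click
#   import emotion
#
#   @click.group()
#   def cli():
#       pass
#
#   cli.add_command(emotion.create_table)    # emotion:create-table
#   cli.add_command(emotion.extract)         # emotion:extract
#   cli.add_command(emotion.normalize)       # emotion:normalize
#   cli.add_command(emotion.coef_over_time)  # emotion:coef-over-time
#   cli.add_command(emotion.analyze)         # emotion:analyze
#
#   if __name__ == "__main__":
#       cli()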