rough draft of paper done.
This commit is contained in:
@@ -2,9 +2,24 @@ import data.main
|
||||
import data.scrape
|
||||
import data.factcheck
|
||||
import data.links
|
||||
import data.bias
|
||||
import data.emotion
|
||||
import data.broken_links
|
||||
import data.selection
|
||||
import data.sentence
|
||||
import data.sentiment
|
||||
import data.word
|
||||
|
||||
__all__ = [
|
||||
'main'
|
||||
,'scrape'
|
||||
,'factcheck'
|
||||
,'links'
|
||||
,'bias'
|
||||
,'emotion'
|
||||
,'broken_links'
|
||||
,'selection'
|
||||
,'sentence'
|
||||
,'sentiment'
|
||||
,'word'
|
||||
]
|
||||
|
||||
146
src/data/bias.py
Normal file
146
src/data/bias.py
Normal file
@@ -0,0 +1,146 @@
|
||||
import click
|
||||
from data.main import connect, paths
|
||||
import pandas as pd
|
||||
from lxml import etree
|
||||
from pathlib import Path
|
||||
import os
|
||||
import csv
|
||||
|
||||
|
||||
@click.command(name="bias:normalize")
def normalize() -> None:
    """Match publishers to bias ratings and add an ordinal bias column.

    Step 1 rebuilds ``publisher_bias`` by fuzzy-matching ``top.publishers``
    names against ``bias_ratings`` publishers (Jaro-Winkler similarity above
    0.95) and keeping the best match per publisher.

    Step 2 adds an integer ``ordinal`` column to ``bias_ratings`` running
    from -2 (left) to 2 (right).
    """
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE publisher_bias AS
            WITH cte AS (
                SELECT
                    p.id as publisher_id
                    ,b.id as bias_id
                    ,b.bias as label
                    ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
                FROM bias_ratings b
                JOIN top.publishers p
                    ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
            ),ranked AS (
                SELECT
                    publisher_id
                    ,bias_id
                    ,label
                    ,similarity
                    -- bias_id breaks ties so repeated runs pick the same match
                    ,ROW_NUMBER() OVER(PARTITION BY publisher_id ORDER BY similarity DESC, bias_id) AS rn
                FROM cte
            )
            SELECT
                publisher_id
                ,label
                ,bias_id
            FROM ranked
            WHERE ranked.rn = 1
        """)

    # The frame must stay bound to the local name `mapping`: the UPDATE below
    # refers to it by name (presumably resolved through DuckDB's scan of
    # in-scope pandas DataFrames -- confirm against connect()).
    mapping = pd.DataFrame([
        {'label': 'left', 'ordinal': -2},
        {'label': 'left-center', 'ordinal': -1},
        {'label': 'center', 'ordinal': 0},
        {'label': 'right-center', 'ordinal': 1},
        {'label': 'right', 'ordinal': 2},
    ])

    with connect() as db:
        # IF NOT EXISTS keeps the command re-runnable once the column exists.
        db.query("alter table bias_ratings add column if not exists ordinal int")
        db.query("""
            update bias_ratings b
            set ordinal = o.ordinal
            FROM mapping o
            WHERE o.label = b.bias
        """)
|
||||
|
||||
|
||||
@click.command(name='bias:parse')
def parse() -> None:
    """Parse the saved HTML page of allsides.com bias ratings into a normalized csv file.

    Reads ``allsides.html`` from the data directory and writes
    ``bias_ratings.csv`` ("|"-separated) with one row per publisher:
    publisher name, bias label (last path segment of the bias link), and the
    community agree / disagree counts.
    """
    bias_html = paths('data') / 'allsides.html'

    parser = etree.HTMLParser()
    tree = etree.parse(str(bias_html), parser)
    root = tree.getroot()
    rows = root.xpath('//table[contains(@class,"views-table")]/tbody/tr')

    # Match the class as a whole token. A plain contains(@class, "agree")
    # also matches "disagree", so the "agree" lookup could pick up the wrong
    # span depending on document order.
    def span_with_class(name: str) -> str:
        return f'.//span[contains(concat(" ", normalize-space(@class), " "), " {name} ")]'

    agree_xpath = span_with_class('agree')
    disagree_xpath = span_with_class('disagree')

    ratings = []
    for row in rows:
        publisher = row.xpath('./td[contains(@class, "source-title")]/a')[0].text

        # the bias label is the last path segment of the rating link
        bias_href = row.xpath('./td[contains(@class, "views-field-field-bias-image")]/a')[0].get('href')
        bias = bias_href.split('/')[-1]

        agree = row.xpath(agree_xpath)[0].text
        disagree = row.xpath(disagree_xpath)[0].text

        ratings.append({
            'publisher': publisher,
            'bias': bias,
            'agree': int(agree),
            'disagree': int(disagree),
        })

    df = pd.DataFrame(ratings)
    df.to_csv(paths('data') / 'bias_ratings.csv', sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)
|
||||
|
||||
@click.command(name="bias:load")
def load() -> None:
    """Load ``bias_ratings.csv`` from the data directory into the ``bias_ratings`` table.

    Each row gets a sequential ``id`` assigned in publisher-name order.
    """
    f = str(paths('data') / "bias_ratings.csv")

    with connect() as db:
        # The csv is written by `bias:parse` with sep="|", so state the
        # delimiter instead of relying on sniffing. OR REPLACE makes the
        # command re-runnable, like the other table-building commands here.
        db.sql(f"""
            CREATE OR REPLACE TABLE bias_ratings as
            select
                row_number() over(order by b.publisher) as id
                ,b.*
            from read_csv_auto('{f}', delim='|', header=true) b
        """)
|
||||
|
||||
@click.command('bias:export')
def export():
    """Export bias ratings and the publisher-to-bias mapping as "|"-separated csv files.

    Writes two files to the data directory:

    * ``TMP_publisher_bias.csv`` -- every bias rating, most-agreed first.
    * ``TMP_publisher_bias_to_load.csv`` -- every publisher with its matched
      bias (empty where no match exists, because of the LEFT JOIN).
    """
    out_dir = paths('data')

    # Each relation is turned into a DataFrame while its connection is still
    # open, so the export never touches a closed connection.
    with connect() as db:
        all_bias = db.query("""
            SELECT
                id as bias_id
                ,publisher as name
                ,bias as label
            FROM bias_ratings
            ORDER by agree desc
        """).df()
    all_bias.to_csv(out_dir / 'TMP_publisher_bias.csv', sep="|", index=False)

    with connect() as db:
        mapped_bias = db.query("""
            SELECT
                p.id as publisher_id
                ,p.name as name
                ,p.tld as tld
                ,b.label as bias
                ,b.bias_id as bias_id
            FROM top.publishers p
            LEFT JOIN publisher_bias b
                ON b.publisher_id = p.id
        """).df()
    mapped_bias.to_csv(out_dir / 'TMP_publisher_bias_to_load.csv', sep="|", index=False)
|
||||
|
||||
@click.command('bias:import-mapped')
def import_mapped():
    """Rebuild ``top.publisher_bias`` from ``TMP_publisher_bias_to_load.csv``.

    Only rows that carry a ``bias_id`` are kept.
    """
    table_name = "top.publisher_bias"
    source_csv = paths('data') / 'TMP_publisher_bias_to_load.csv'

    # Keep the local name `df`: the statement below selects FROM df.
    df = pd.read_csv(source_csv, sep="|")

    statement = f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT
            publisher_id AS publisher_id
            ,cast(bias_id AS int) as bias_id
        FROM df
        WHERE bias_id IS NOT NULL
    """
    with connect() as db:
        db.query(statement)

    print(f"created table: {table_name}")
|
||||
|
||||
40
src/data/broken_links.py
Normal file
40
src/data/broken_links.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import requests
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import click
|
||||
|
||||
from data.main import connect
|
||||
|
||||
@click.command(name="broken:crawl")
def crawl():
    """crawl story urls checking for link rot or redirects.

    Fetches the five oldest stories, records the outcome of each request,
    and plots a histogram of the HTTP status codes that were received.
    """
    with connect() as db:
        urls = db.query("""
            select
                id
                ,url
            from stories
            order by published_at asc
            limit 5
        """).fetchall()

    responses = []
    for story_id, url in urls:
        # status_code stays None when no response arrives, so a failed
        # request is never counted as a successful 200.
        out = {
            'story_id': story_id,
            'final_url': url,
            'redirect': 0,
            'timeout': 0,
            'status_code': None,
            'content_length': 0,
        }
        try:
            # NOTE(review): verify=False disables TLS certificate checks.
            response = requests.get(url, verify=False, timeout=10)
        except requests.exceptions.Timeout:
            # covers both connect and read timeouts
            print(f"timeout: {url}")
            out['timeout'] = 1
        except requests.exceptions.RequestException as e:
            # DNS failure, refused connection, invalid URL, ...
            print(f"failed: {url} ({e})")
        else:
            # history holds one entry per redirect hop that was followed
            if response.history:
                out['redirect'] = 1
            if url != response.url:
                out['final_url'] = response.url
            out['status_code'] = response.status_code
            out['content_length'] = len(response.content)
        responses.append(out)

    status_codes = [r['status_code'] for r in responses if r['status_code'] is not None]
    if status_codes:
        sns.histplot(x=status_codes)
        plt.show()
|
||||
484
src/data/emotion.py
Normal file
484
src/data/emotion.py
Normal file
@@ -0,0 +1,484 @@
|
||||
import click
|
||||
from tqdm import tqdm
|
||||
import torch
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from transformers import BertTokenizer
|
||||
from train.model import BertForMultiLabelClassification
|
||||
from data.main import connect, data_dir
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.dates import DateFormatter
|
||||
import matplotlib.dates as mdates
|
||||
|
||||
def data():
    """Return the stories that have no emotion labels yet.

    Returns:
        A DataFrame with columns ``id`` and ``title``, newest id first,
        covering every story whose id is absent from ``story_emotions``.
    """
    # The context manager closes the connection even if the query raises.
    with connect() as db:
        table = db.sql("""
            SELECT
                id,
                title
            FROM stories
            WHERE id NOT IN (
                SELECT
                    DISTINCT story_id
                FROM story_emotions
            )
            ORDER BY id DESC
        """).df()

    return table
|
||||
|
||||
@click.command("emotion:create-table")
def create_table():
    """create the table to hold the title id and labels."""
    table = "story_emotions"
    # The statement must be an f-string; otherwise the literal text
    # "{table}" is sent to the database instead of the table name.
    with connect() as db:
        db.execute(f"""
            CREATE OR REPLACE TABLE {table}
            (
                story_id BIGINT,
                label TEXT,
                score REAL
            )
        """)
    print(f"\"{table}\" created")
|
||||
|
||||
@click.command("emotion:extract")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
def extract(chunks):
    """extract emotion class labels from titles and put them in the db

    Unlabelled stories (see ``data()``) are split into ``chunks`` batches.
    Each batch is classified with the GoEmotions BERT model and the labels
    returned by ``run()`` are appended to ``story_emotions``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
    model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")
    model.to(device)

    table = data()
    chunked = np.array_split(table.to_numpy(), chunks)
    for part in tqdm(chunked):
        # array_split pads with empty parts when there are fewer rows than
        # chunks; there is nothing to tokenize in those.
        if len(part) == 0:
            continue
        ids = [x[0] for x in part]
        docs = [x[1] for x in part]
        tokens = tokenizer(
            docs,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=92,
            return_attention_mask=True,
            return_tensors="pt",
        )
        tokens = tokens.to(device)
        results = run(model, tokens, ids)
        if not results:
            # no label cleared the threshold; an empty frame has no columns
            # and cannot be inserted
            continue
        # Keep the local name `df`: the INSERT below selects FROM df.
        df = pd.DataFrame(results, columns=["story_id", "label", "score"])
        with connect() as db:
            db.execute('INSERT INTO story_emotions SELECT * FROM df')
|
||||
|
||||
def run(model, tokens, ids):
    """Score one tokenized batch and return every label above the threshold.

    Args:
        model: callable invoked as ``model(**tokens)``; element 0 of its
            result holds the logits, and ``model.config.id2label`` maps a
            class index to its label.
        tokens: keyword arguments for the model.
        ids: story ids, positionally aligned with the rows of the batch.

    Returns:
        A list of ``{"story_id", "label", "score"}`` dicts, one for each
        (row, class) pair whose sigmoid score exceeds 0.1, in row-major order.
    """
    cutoff = 0.1
    with torch.no_grad():
        logits = model(**tokens)[0].to('cpu').detach().numpy()
    probabilities = 1 / (1 + np.exp(-logits))  # Sigmoid

    id2label = model.config.id2label
    return [
        {"story_id": ids[row], "label": id2label[col], "score": score}
        for row, row_scores in enumerate(probabilities)
        for col, score in enumerate(row_scores)
        if score > cutoff
    ]
|
||||
|
||||
@click.command("emotion:normalize")
def normalize():
    """normalize the emotion tables.

    Builds the ``emotions`` lookup table (labels attached to more than 1000
    stories), then replaces the text ``label`` column of ``story_emotions``
    with an ``emotion_id`` pointing into that lookup.
    """
    # The context manager closes the connection even if a statement fails.
    with connect() as db:
        db.sql("""
            CREATE OR REPLACE TABLE emotions AS
            SELECT
                row_number() over() as id
                ,e.label
                ,COUNT(1) AS stories
            FROM story_emotions e
            JOIN stories s
                ON s.id = e.story_id
            -- WHERE YEAR(s.published_at) < 2022
            GROUP BY e.label
            HAVING stories > 1000
            ORDER BY stories DESC
        """)
        db.sql("""
            ALTER TABLE story_emotions
            ADD COLUMN emotion_id int64
        """)
        db.sql("""
            UPDATE story_emotions
            SET emotion_id = emotions.id
            FROM emotions
            WHERE emotions.label = story_emotions.label
        """)
        db.sql("""
            ALTER TABLE story_emotions
            DROP COLUMN label
        """)
        # The former trailing SELECT on story_emotions.label was removed: it
        # ran after the column had been dropped and its result was unused.
|
||||
|
||||
@click.command("emotion:analyze")
def coef_over_time():
    """Fit a linear trend to each emotion's share of stories and print the slopes.

    For every label in ``emotions`` except 'neutral', the fraction of stories
    carrying that label is computed per two-year bucket (stories published
    before 2022). A linear regression of share against time is fitted and the
    slope is reported together with whether it is increasing.
    """
    # NOTE(review): this command is registered under the same name
    # ("emotion:analyze") as `analyze` in this module -- confirm which one
    # the CLI is meant to expose.
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    def load_shares(db, buckets='1 month'):
        """Return per-bucket, per-label story shares; date is in years since the epoch."""
        return db.sql(f"""
            with cte as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
            )
            select
                epoch(cte.date) / 60 / 60 / 24 / 365 as date
                ,cte.label
                ,cast(cte.stories as float) / t.stories as stories
            from cte
            join total t
                on t.date = cte.date
        """).df()

    def get_coef(shares, label):
        """Return the fitted slope array for one label."""
        reg = linear_model.LinearRegression()
        df = shares[shares['label'] == label]
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']
        # Split x and y in a single call so each training date stays paired
        # with its own share; splitting them separately shuffles them
        # independently.
        x_train, _x_test, y_train, _y_test = train_test_split(x, y)
        reg.fit(x_train, y_train)
        return reg.coef_

    with connect() as db:
        emotions = db.sql("""
            select label from emotions
        """).df()
        shares = load_shares(db, '2 year')

    collection = []
    for emotion in emotions['label']:
        if emotion == 'neutral':
            continue
        coef = get_coef(shares, emotion)[0]
        collection.append({'emotion': emotion, 'coef': coef, 'increasing': coef > 0})

    if collection:
        # the sorted table was previously built and thrown away
        print(pd.DataFrame(collection).sort_values('coef'))
|
||||
|
||||
@click.command("emotion:analyze")
def analyze():
    """plot and group emotional labels"""
    # NOTE(review): exploratory code. Several names used below are not
    # defined anywhere in this function (see the notes); it presumably does
    # not run end-to-end as written -- confirm before relying on it.
    DB = connect()

    # labels present in the emotions lookup table (not used further below)
    emotions = DB.sql("""
        select label from emotions
    """).df()

    from sklearn import linear_model
    from sklearn.model_selection import train_test_split
    # NOTE(review): get_coef is never called in this function, and its
    # `emotion` parameter is unused -- the query is hard-coded to 'sadness'.
    def get_coef(emotion):
        # Monthly 'sadness' story counts divided by the label's overall
        # count from `emotions`. The `total` CTE is defined but not joined.
        df = DB.sql("""
            with cte as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                --AND e.label in ('neutral', 'annoyance')
                AND e.label in ('sadness')
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
            )
            select
                epoch(cte.date) as date
                ,cte.label
                --,total.stories as total
                ,cast(cte.stories as float) / e.stories as stories
            from cte
            join emotions e
                --on total.date = cte.date
                on e.label = cte.label
        """).df()

        reg = linear_model.LinearRegression()
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']

        # NOTE(review): x and y are split by two separate calls, so the
        # training rows of x and y are presumably not aligned -- confirm.
        x_train, x_test = train_test_split(x)
        y_train, y_test = train_test_split(y)
        reg.fit(x_train, y_train)
        #y_pred = reg.predict(x_test)
        return reg.coef_


    # NOTE(review): `yearly` is not defined in this function.
    df = DB.sql(f"""{yearly}""").df()
    df['date'] = pd.to_datetime(df['date'])
    ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
    #ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
    plt.locator_params(axis='y', nbins=6)
    ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
    plt.show()

    # NOTE(review): the `grouped` CTE has an empty body, and the result of
    # this query is not assigned to anything.
    DB.sql("""
        WITH grouped as (
        ), total AS (
            SELECT
                e.label
                ,count(1) as total
            FROM grouped s
            JOIN story_emotions e
                ON e.label = s.label
            GROUP BY
                e.label
        )
        SELECT
            g.year
            ,g.label
            ,100 * (g.stories / CAST(t.total AS float)) AS frac
        FROM grouped g
        JOIN total t
            ON t.label = g.label
        ORDER BY g.label, g.year
    """)
    DB.close()

    # NOTE(review): `df` still refers to the frame loaded from `yearly`
    # above; nothing visible here gives it 'year' or 'frac' columns.
    sns.lineplot(x=df['year'], y=df['frac'], hue=df['label'])
    plt.show()
|
||||
|
||||
def debug():
    """Label every story title with the distilroberta emotion classifier.

    Rebuilds ``story_emotions`` and ``emotions`` from the classifier output,
    swaps the text label for an ``emotion_id``, and writes the raw
    predictions to ``emotions.csv`` in the data directory.
    """
    from transformers import pipeline

    # load data
    DB = connect()
    table = DB.sql("""
        SELECT
            id,
            title
        FROM stories
        ORDER BY id DESC
    """).df()
    DB.close()

    classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

    chunks = 5000
    batches = np.array_split(table, chunks)
    batch_labels = []
    batch_ids = []
    for batch in tqdm(batches):
        titles = batch['title'].tolist()
        story_ids = batch['id'].tolist()
        with torch.no_grad():
            predictions = classifier(titles)
        batch_labels.append(predictions)
        batch_ids.append(story_ids)

    label_frame = pd.DataFrame(np.concatenate(batch_labels).tolist())
    id_frame = pd.DataFrame(np.concatenate(batch_ids).tolist(), columns=['story_id'])
    # Keep the local name `out`: the CREATE TABLE below selects FROM out.
    out = pd.concat([id_frame, label_frame], axis=1)

    DB = connect()
    DB.sql("""
        CREATE OR REPLACE TABLE story_emotions AS
        SELECT
            story_id
            ,label
            ,score
        FROM out
    """)
    DB.sql("""
        CREATE OR REPLACE TABLE emotions AS
        SELECT
            row_number() over() as id
            ,label
            ,count(1) as stories
        FROM story_emotions
        GROUP BY
            label
    """)
    DB.sql("""
        ALTER TABLE story_emotions add emotion_id bigint
    """)
    DB.sql("""
        UPDATE story_emotions
        SET emotion_id = emotions.id
        FROM emotions
        WHERE story_emotions.label = emotions.label
    """)
    DB.sql("""
        ALTER TABLE story_emotions drop column label
    """)
    # the two inspection queries below discard their results
    DB.sql("""
        select
            *
        from emotions
    """)
    DB.sql("""
        select
            * from story_emotions
        limit 4
    """)
    DB.close()

    out.to_csv(data_dir() / 'emotions.csv', sep="|")
|
||||
|
||||
def another():
    # NOTE(review): exploratory code; see the notes below for names that are
    # not defined here and for queries issued after DB.close(). It
    # presumably does not run end-to-end as written -- confirm.
    DB = connect()

    # inspection queries; their results are discarded
    DB.sql("""
        select
            *
        from emotions
    """)

    DB.sql("""
        select
            *
        from story_emotions
    """)

    # yearly story counts per emotion label
    emotions = DB.sql("""
        SELECT
            YEAR(s.published_at) AS year
            ,e.label AS emotion
            ,count(1) AS stories
        FROM stories s
        JOIN story_emotions se
            ON s.id = se.story_id
        JOIN emotions e
            ON e.id = se.emotion_id
        GROUP by
            YEAR(s.published_at)
            ,e.label
    """).df()
    emotions

    sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
    plt.show()

    # one row per year, one column per emotion
    pivot = emotions.pivot(index='year', columns='emotion', values='stories')
    pivot.reset_index(inplace=True)
    from sklearn.linear_model import LinearRegression
    reg = LinearRegression()

    # fit story count against year for every emotion column (column 0 is 'year')
    for emotion in pivot.keys()[1:].tolist():
        _ = reg.fit(pivot['year'].to_numpy().reshape(-1, 1), pivot[emotion])
        print(f"{emotion}: {reg.coef_[0]}")

    fig, ax = plt.subplots()
    #sns.lineplot(x=pivot['anger'], y=pivot['joy'])
    #sns.lineplot(x=pivot['anger'], y=pivot['surprise'], ax=ax)
    sns.lineplot(x=pivot['anger'], y=pivot['fear'], ax=ax)
    # NOTE(review): pivot[''] looks up a column named with the empty string.
    sns.lineplot(x=pivot[''], y=pivot['fear'], ax=ax)
    plt.show()

    # NOTE(review): DB is closed here but is used again by the queries below.
    DB.close()

    # share of each emotion within a (year, bias) group, excluding the
    # 'allsides' bias label and the 'neutral' emotion
    normalized = DB.sql("""
        with cte as (
            select
                year(s.published_at) as year
                ,se.label as emotion
                ,b.label as bias
            from stories s
            join story_emotions se
                on s.id = se.story_id
            join publisher_bias b
                on b.id = s.publisher_id
            where b.label != 'allsides'
            and se.label != 'neutral'
        )
        select
            distinct
            year
            ,emotion
            ,bias
            ,cast(count(1) over(partition by year, bias, emotion) as float) / count(1) over(partition by year, bias) as group_count
        from cte
    """).df()

    # story counts per bias label; the result is discarded
    DB.sql("""
        select
            b.label as bias
            ,count(1) as stories
        from stories s
        join story_emotions se
            on s.id = se.story_id
        join publisher_bias b
            on b.id = s.publisher_id
        group by
            b.label
    """).df()

    # NOTE(review): `emotional_bias` is not defined in this function.
    another_pivot = emotional_bias.pivot(index=['bias', 'year'], columns='emotion', values='stories')
    another_pivot.reset_index(inplace=True)

    sns.lineplot(data=normalized, x='year', y='group_count', hue='bias', style='emotion')
    plt.show()

    sns.relplot(
        data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line"
        #data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line", facet_kws=dict(sharey=False)
    )
    plt.show()

    DB.sql("""
        select
            *
        from another_pivot
    """)
|
||||
@@ -8,7 +8,7 @@ from pathlib import Path
|
||||
import os
|
||||
import sys
|
||||
import click
|
||||
from data.main import connect, map_tld, paths
|
||||
from data.main import connect, map_tld, paths, reporting_label_to_int
|
||||
from random import randint
|
||||
from time import sleep
|
||||
from tqdm import tqdm
|
||||
@@ -155,7 +155,7 @@ def create_tables():
|
||||
FROM stories s
|
||||
""").df()
|
||||
|
||||
stories['tld'] = stories.url.apply(map_tld)
|
||||
raw_stories['tld'] = raw_stories.url.apply(map_tld)
|
||||
|
||||
with connect() as db:
|
||||
db.sql("""
|
||||
@@ -167,5 +167,25 @@ def create_tables():
|
||||
JOIN mbfc.publishers p
|
||||
ON p.tld = s.tld
|
||||
""")
|
||||
with connect() as db:
|
||||
data = db.sql("""
|
||||
select
|
||||
id,
|
||||
reporting
|
||||
from mbfc.publishers p
|
||||
""").df()
|
||||
|
||||
with connect() as db:
|
||||
db.sql("""
|
||||
alter table mbfc.publishers add column reporting_ordinal int
|
||||
""")
|
||||
|
||||
data['ordinal'] = data.reporting.apply(reporting_label_to_int)
|
||||
|
||||
with connect() as db:
|
||||
db.sql("""
|
||||
update mbfc.publishers
|
||||
set reporting_ordinal = data.ordinal
|
||||
from data
|
||||
where data.id = publishers.id
|
||||
""")
|
||||
|
||||
@@ -22,6 +22,8 @@ def paths(name='app'):
|
||||
return Path(os.environ['DATA_MINING_DOCS_DIR'])
|
||||
if 'figure' in name:
|
||||
return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
|
||||
if 'model' in name:
|
||||
return Path(os.environ['DATA_MINING_DATA_DIR']) / 'models'
|
||||
|
||||
def connect():
|
||||
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
|
||||
@@ -105,3 +107,32 @@ def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
|
||||
except:
|
||||
print(f"no mapping for {class_id}", file=sys.stderr)
|
||||
return -1
|
||||
|
||||
def reporting_label_to_int(label):
    """Map a factual-reporting label to its ordinal rank.

    The scale runs from 0 ('Very Low') to 5 ('Very High'). 'Mixed' sits
    between 'Low' and 'Mostly Factual' and is therefore 2; it used to map to
    -1, which made it indistinguishable from an unknown label.

    Args:
        label: the reporting label to look up.

    Returns:
        The ordinal in 0..5, or -1 when the label has no mapping (unknown
        text, ``None``, NaN, or an unhashable value).
    """
    mapping = {
        'Very Low': 0,
        'Low': 1,
        'Mixed': 2,
        'Mostly Factual': 3,
        'High': 4,
        'Very High': 5,
    }
    try:
        return mapping[label]
    except (KeyError, TypeError):
        # KeyError: unknown label; TypeError: unhashable input
        return -1
|
||||
|
||||
def save_model(model, name):
    """Pickle ``model`` to ``paths('models') / name`` and print the destination."""
    import pickle

    destination = paths('models') / name
    with open(destination, 'wb') as handle:
        pickle.dump(model, handle)
    print(f"saved model: {destination}")
|
||||
|
||||
def load_model(name):
    """Unpickle and return the model stored at ``paths('models') / name``."""
    import pickle

    source = paths('models') / name
    print(f"loading model: {source}")
    # NOTE: pickle.load executes code embedded in the file; only load model
    # files produced by this project.
    with open(source, 'rb') as handle:
        return pickle.load(handle)
|
||||
|
||||
287
src/data/sentence.py
Normal file
287
src/data/sentence.py
Normal file
@@ -0,0 +1,287 @@
|
||||
import click
|
||||
from transformers import AutoTokenizer, AutoModel
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from data.main import connect, paths, save_model, load_model, ticklabels
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per row, counting only unmasked positions.

    Args:
        model_output: indexable whose first element holds the token embeddings.
        attention_mask: mask that is unsqueezed on its last axis and expanded
            to the embeddings' size; positions with 0 are excluded.

    Returns:
        The mask-weighted mean over axis 1. The divisor is clamped to at
        least 1e-9, so a fully masked row yields zeros.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(1)
    token_count = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_count
|
||||
|
||||
@click.command("sentence:embed")
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
def embed(chunks):
    """Embed every story title with all-MiniLM-L6-v2 and save the vectors.

    Titles are ASCII-normalized, embedded in ``chunks`` batches (mean pooling
    followed by L2 normalization) and written to the data directory as
    ``embeddings.npy``, with the matching story ids in ``embedding_ids.npy``.
    """
    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # load data
    with connect() as db:
        table = db.sql("""
            select
                id
                ,title
            from stories
            order by id desc
        """).df()

    if table.empty:
        # nothing to embed; np.concatenate below would fail on an empty list
        print("no stories found, nothing to embed")
        return

    # normalize text
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

    # Split row positions rather than the frame itself; batches are then
    # taken with iloc.
    batches = np.array_split(np.arange(len(table)), chunks)

    # generate embeddings from list of titles
    embeddings = []
    embedding_ids = []
    for positions in tqdm(batches, 'embedding'):
        # array_split pads with empty batches when chunks exceeds the row
        # count; the tokenizer cannot handle an empty list.
        if len(positions) == 0:
            continue
        chunk = table.iloc[positions]
        sentences = chunk['title'].tolist()
        ids = chunk['id'].tolist()
        # Tokenize sentences
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)
        # Perform pooling
        output = mean_pooling(model_output, encoded_input['attention_mask'])
        # Normalize embeddings
        output = F.normalize(output, p=2, dim=1)
        embeddings.append(output.numpy())
        embedding_ids.append(ids)

    embeddings = np.concatenate(embeddings)
    ids = np.concatenate(embedding_ids)

    # save embeddings
    save_to = paths('data') / 'embeddings.npy'
    np.save(save_to, embeddings)
    print(f"embeddings saved: {save_to}")

    # save ids
    save_to = paths('data') / 'embedding_ids.npy'
    np.save(save_to, ids)
    print(f"ids saved: {save_to}")
|
||||
|
||||
|
||||
@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
    """Project each publisher's mean title embedding onto two PCA components.

    Loads the saved embeddings, averages them per publisher (publishers with
    a bias rating only), fits a 2-component PCA on those averages and stores
    the result in ``top.publisher_embeddings_pca``.
    """
    from sklearn.decomposition import PCA

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    # reset_index adds an `index` column holding each story's row position in
    # `embeddings`. The name `ids` must be kept: the query joins against it.
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,s.publisher_id
                ,b.ordinal
            FROM ids
            JOIN top.stories s
                ON ids.story_id = s.id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

    results = []
    # Group by the bare column name: grouping by a one-element list yields
    # 1-tuples as keys on recent pandas, which would end up in publisher_id.
    for publisher_id, group in data.groupby('publisher_id'):
        avg = embeddings[group['index']].mean(axis=0)
        ordinal = group['ordinal'].iloc[0]
        results.append({'publisher_id': publisher_id, 'embedding': avg, 'ordinal': ordinal})
    # Keep the local name `results`: the CREATE TABLE below selects FROM results.
    results = pd.DataFrame(results)

    x = np.stack(results['embedding'])

    model = PCA(n_components=2)
    pred = model.fit_transform(x)
    results['first'] = pred[:, 0]
    results['second'] = pred[:, 1]

    table_name = "top.publisher_embeddings_pca"
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                results.publisher_id as publisher_id
                ,results.first as first
                ,results.second as second
            FROM results
        """)

    print(f"created {table_name}")
|
||||
|
||||
|
||||
@click.command('sentence:create-pca-table')
def create_pca_table():
    """Project every labelled story embedding onto two PCA components.

    Loads the saved embeddings, keeps stories whose publisher has a known
    bias ordinal (``ordinal != -1``), fits a 2-component PCA and stores the
    result in ``story_embeddings_pca``.
    """
    from sklearn.decomposition import PCA

    # `paths` is the helper imported from data.main; the original called an
    # undefined `path`.
    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    # reset_index adds an `index` column holding each story's row position in
    # `embeddings`. The name `ids` must be kept: the query joins against it.
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        # Keep the local name `data`: the CREATE TABLE below selects FROM data.
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.bias
                ,p.ordinal
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    model = PCA(n_components=2)
    pred = model.fit_transform(x)
    data['first'] = pred[:, 0]
    data['second'] = pred[:, 1]

    table_name = "story_embeddings_pca"
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
            SELECT
                data.id as story_id
                ,data.first as first
                ,data.second as second
            FROM data
        """)
    print(f"created {table_name}")
|
||||
|
||||
@click.command('sentence:create-svm-table')
def create_svm_table():
    """sentence to classifier

    Fits an ``SGDClassifier`` that predicts the publisher bias ordinal from a
    story's title embedding and saves it as ``sgdclassifier.pkl``.
    """
    from sklearn import svm
    from sklearn.linear_model import SGDClassifier

    data_dir = paths('data')
    embeddings = np.load(data_dir / 'embeddings.npy')
    embedding_ids = np.load(data_dir / 'embedding_ids.npy')
    # Keep the local name `ids`: the query below joins against it.
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    query = """
        SELECT
            ids.index
            ,s.id
            ,p.ordinal
            ,p.bias
        FROM ids
        JOIN stories s
            ON ids.story_id = s.id
        JOIN mbfc.publisher_stories ps
            ON s.id = ps.story_id
        JOIN mbfc.publishers p
            ON p.id = ps.publisher_id
        WHERE p.ordinal != -1
    """
    with connect() as db:
        data = db.query(query).df()

    features = embeddings[data['index']]
    targets = data['ordinal']

    classifier = SGDClassifier().fit(features, targets)
    # data['pred'] = pred.predict(x)
    save_model(classifier, 'sgdclassifier.pkl')
|
||||
|
||||
def interence():
    """Run the saved SGD bias classifier on two sample sentences.

    The sentences are embedded the same way as in ``embed`` (all-MiniLM-L6-v2,
    mean pooling, L2 normalization), then classified with the pickled model
    ``sgdclassifier.pkl``.

    Returns:
        The array of predicted bias ordinals, one per sample sentence.
    """
    with connect() as db:
        bias = db.query("""
            SELECT
                p.bias
                ,p.ordinal
            FROM mbfc.publishers p
            WHERE p.ordinal != -1
            GROUP BY
                p.bias
                ,p.ordinal
            ORDER BY
                p.ordinal
        """).df()

    sdg = load_model('sgdclassifier.pkl')

    # `tokenizer` and `model` were previously undefined here; load the same
    # checkpoint that `embed` uses to build the embeddings.
    checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)

    sentences = ["hello, i hate woke culture.", "trump is winning"]
    tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        output = model(**tokens)

    output = mean_pooling(output, tokens['attention_mask'])
    output = F.normalize(output, p=2, dim=1)

    predictions = sdg.predict(output.numpy())

    # show the human-readable bias label next to each prediction
    labels = dict(zip(bias['ordinal'], bias['bias']))
    for sentence, ordinal in zip(sentences, predictions):
        print(f"{sentence!r}: {ordinal} ({labels.get(ordinal, 'unknown')})")

    return predictions
|
||||
|
||||
def validation():
    """Train a LinearSVC on title embeddings and plot its test confusion matrix.

    Uses a fixed 67/33 train/test split (``random_state=42``) of the stories
    whose publisher has a known bias ordinal. The figure is saved to the
    figures directory and then shown.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC
    from sklearn.metrics import ConfusionMatrixDisplay
    import matplotlib.pyplot as plt

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    # Keep the local name `ids`: the query below joins against it.
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    y = data['ordinal']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    clf = LinearSVC()
    clf.fit(x_train, y_train)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, clf.predict(x_test), ax=ax)
    # the classifier is a linear SVM, not kNN as the old title said
    ax.set(title="confusion matrix for linear SVM classifier on test data.", xticklabels=ticklabels(), yticklabels=ticklabels())

    # `save_to` was previously undefined. Save before show(): saving after
    # the window is closed would presumably write an empty figure.
    # NOTE(review): the file name is a placeholder -- confirm the one the paper expects.
    save_to = paths('figures') / 'sentence_confusion_matrix.png'
    plt.savefig(save_to)
    plt.show()
|
||||
@@ -20,15 +20,14 @@ def extract(chunks):
|
||||
|
||||
|
||||
# load data
|
||||
DB = connect()
|
||||
table = DB.sql("""
|
||||
select
|
||||
id
|
||||
,title
|
||||
from stories
|
||||
order by id desc
|
||||
""").df()
|
||||
DB.close()
|
||||
with connect() as db:
|
||||
table = db.sql("""
|
||||
select
|
||||
id
|
||||
,title
|
||||
from stories
|
||||
order by id desc
|
||||
""").df()
|
||||
|
||||
# normalize text
|
||||
table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
|
||||
@@ -56,12 +55,12 @@ def extract(chunks):
|
||||
story_ids = np.concatenate(story_ids)
|
||||
|
||||
# save embeddings
|
||||
save_to = data_dir() / 'sentiment.npy'
|
||||
save_to = paths('data') / 'sentiment.npy'
|
||||
np.save(save_to, sentiments)
|
||||
print(f"sentiments saved: {save_to}")
|
||||
|
||||
# save ids
|
||||
save_to = data_dir() / 'sentiment_ids.npy'
|
||||
save_to = paths('data') / 'sentiment_ids.npy'
|
||||
np.save(save_to, story_ids)
|
||||
print(f"ids saved: {save_to}")
|
||||
|
||||
|
||||
93
src/data/word.py
Normal file
93
src/data/word.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import click
|
||||
from transformers import AutoTokenizer, RobertaModel
|
||||
import numpy as np
|
||||
from data.main import connect, paths
|
||||
from tqdm import tqdm
|
||||
import torch
|
||||
from pathlib import Path
|
||||
|
||||
@click.command(name="word:max-sequence")
def max_sequence():
    """calculate the maximum token length given the story titles"""
    query = """
            select
                title
            from stories
            order by length(title) desc
            limit 5000
        """
    # Only the 5000 longest titles (by character count) are tokenized.
    with connect() as db:
        longest = db.sql(query).df()

    titles = longest['title'].to_list()
    encoded = AutoTokenizer.from_pretrained("roberta-base")(titles)

    # Report the largest number of token ids produced for any single title.
    widest = max(len(token_ids) for token_ids in encoded['input_ids'])
    print(f"{widest}")
|
||||
|
||||
@click.command(name="word:embed")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
@click.option('--embedding_dir', help="path to save embeddings as np array", type=Path, default=Path(paths('data') / 'embeddings'), show_default=True)
@click.option('--token_dir', help="path to save tokens as np array", type=Path, default=Path(paths('data') / 'tokens'), show_default=True)
@click.option('--device', help="device to process data on", type=str, default="cuda:0", show_default=True)
def embed(chunks, embedding_dir, token_dir, device):
    """ given titles, generate tokens and word embeddings and saves to disk

    Titles are read from the ``stories`` table (newest id first), stripped to
    ASCII, split into ``chunks`` roughly equal batches and run through
    ``roberta-base``. For batch ``i`` the last hidden state is written to
    ``embedding_dir/embedding_{i}.npy`` and the token ids to
    ``token_dir/token_{i}.npy``.
    """

    # init models
    device = torch.device(device)
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = RobertaModel.from_pretrained("roberta-base")
    model.to(device)

    # np.save does not create missing parent directories.
    embedding_dir.mkdir(parents=True, exist_ok=True)
    token_dir.mkdir(parents=True, exist_ok=True)

    # load data
    with connect() as db:
        table = db.sql("""
            select
                title
            from stories
            order by id desc
        """).df()

    # normalize text
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

    # generate embeddings from list of titles
    # array_split places any empty batches last (when `chunks` exceeds the row
    # count); dropping them keeps the numbering of the non-empty batches.
    batches = [batch for batch in np.array_split(table['title'].to_numpy(), chunks) if batch.size]
    chunk_iter = tqdm(batches, 'embedding')
    cpu = torch.device('cpu')
    for i, chunk in enumerate(chunk_iter):
        # create tokens, padding to max width
        tokens = tokenizer(chunk.tolist(), add_special_tokens = True, truncation = True, padding = "max_length", max_length=92, return_attention_mask = True, return_tensors = "pt")
        tokens = tokens.to(device)
        with torch.no_grad():
            outputs = model(**tokens)

        # to disk
        hidden = outputs.last_hidden_state.to(cpu).detach().numpy()
        np.save(embedding_dir / f"embedding_{i}.npy", hidden)

        # Persist the token ids as a numeric array; handing the tokenizer's
        # whole mapping object to np.save does not store the id tensor itself.
        token_ids = tokens['input_ids'].to(cpu).numpy()
        np.save(token_dir / f"token_{i}.npy", token_ids)
|
||||
|
||||
@click.command(name="word:distance")
def distance():
    """TODO: measure distance between sequence embeddings

    Unfinished: the body sketches finding the closest pair of rows in a
    pairwise euclidean distance matrix, but cannot run as written (see the
    review notes below).
    """
    # NOTE(review): at module level the name `distance` is bound to this click
    # command, and the imports visible in this file bring in no other
    # `distance` (e.g. scipy.spatial.distance) - so `distance.cdist` cannot
    # resolve to a distance routine here. `classes` is also not defined in the
    # visible file. Both need to be supplied before this command can work.
    distances = distance.cdist(classes, classes, 'euclidean')
    # Put infinity on the diagonal so a row is never picked as its own
    # nearest neighbour.
    np.fill_diagonal(distances, np.inf)
    # argmin works on the flattened matrix; unravel_index converts that flat
    # position back into a (row, column) pair.
    min_index = (np.argmin(distances))
    closest = np.unravel_index(min_index, distances.shape)
    # NOTE(review): the expression below is evaluated and discarded, and
    # `closest` is never used or printed.
    distances.flatten().shape

    # Disabled draft: concatenates the per-chunk embedding files (stopping
    # after i > 20) into a single array.
    # NOTE(review): the final np.save call passes the array first and the
    # path second; numpy.save expects the file first - confirm before
    # re-enabling.
    # path = paths('data') / 'embeddings'
    # chunks = [x for x in path.iterdir() if x.match('*.npy')]
    # chunks = sorted(chunks, key=lambda x: int(x.stem.split('_')[1]))
    #
    # data = None
    # for i, f in enumerate(tqdm(chunks)):
    #     loaded = np.load(f)
    #     if data is None:
    #         data = loaded
    #     else:
    #         data = np.concatenate([data, loaded])
    #     if i > 20:
    #         break
    #
    # data.shape
    #
    # np.save(data, paths('data') / 'embeddings.npy')
|
||||
Reference in New Issue
Block a user