rough draft of paper done.

This commit is contained in:
matt
2023-06-07 20:44:48 -07:00
parent 245f60a7a8
commit 7edb8543a7
38 changed files with 1130 additions and 388 deletions

View File

@@ -2,70 +2,85 @@ import click
from dotenv import load_dotenv
import data
import plots
import mining
import train
@click.group()
def cli():
    """Top-level CLI entry point; the data/mining/plot/train groups hang off this."""
    ...
@cli.group(name="data")
def data_subcommand():
"""data subcommands"""
...
@cli.group(name="mining")
def mining_subcommand():
"""mining subcommands"""
...
@cli.group(name="plot")
def plot_subcommand():
"""plotting subcommands"""
...
@cli.group(name="train")
def train_subcommand():
"""train subcommands"""
...
if __name__ == "__main__":
load_dotenv()
# original bias ratings
cli.add_command(data.scrape.download)
cli.add_command(data.scrape.parse)
cli.add_command(data.scrape.load)
cli.add_command(data.scrape.normalize)
cli.add_command(data.scrape.create_elections_table)
data_subcommand.add_command(data.scrape.download)
data_subcommand.add_command(data.scrape.parse)
data_subcommand.add_command(data.scrape.load)
data_subcommand.add_command(data.scrape.normalize)
data_subcommand.add_command(data.scrape.create_elections_table)
cli.add_command(data.factcheck.parse_index)
cli.add_command(data.factcheck.scrape)
data_subcommand.add_command(data.factcheck.parse_index)
data_subcommand.add_command(data.factcheck.scrape)
cli.add_command(data.links.create_table)
cli.add_command(data.links.create_pca)
cli.add_command(data.links.create_clusters)
data_subcommand.add_command(data.links.create_table)
data_subcommand.add_command(data.links.create_pca)
data_subcommand.add_command(data.links.create_clusters)
import word
# cli.add_command(word.distance)
# cli.add_command(word.train)
cli.add_command(word.embed)
cli.add_command(word.max_sequence)
import bias
cli.add_command(bias.parse)
cli.add_command(bias.load)
cli.add_command(bias.normalize)
data_subcommand.add_command(data.bias.parse)
data_subcommand.add_command(data.bias.load)
data_subcommand.add_command(data.bias.normalize)
import mine
cli.add_command(mine.embeddings)
cli.add_command(mine.cluster)
cli.add_command(mine.plot)
data_subcommand.add_command(data.emotion.extract)
data_subcommand.add_command(data.emotion.normalize)
data_subcommand.add_command(data.emotion.analyze)
data_subcommand.add_command(data.emotion.create_table)
import emotion
cli.add_command(emotion.extract)
cli.add_command(emotion.normalize)
cli.add_command(emotion.analyze)
cli.add_command(emotion.create_table)
data_subcommand.add_command(data.word.embed)
data_subcommand.add_command(data.word.max_sequence)
data_subcommand.add_command(data.sentence.embed)
data_subcommand.add_command(data.sentence.create_avg_pca_table)
import sentence
cli.add_command(sentence.embed)
cli.add_command(sentence.create_avg_pca_table)
mining_subcommand.add_command(mining.main.embeddings)
mining_subcommand.add_command(mining.main.cluster)
mining_subcommand.add_command(mining.main.plot)
from train import main as train_main
cli.add_command(train_main.main)
plot_subcommand.add_command(plots.descriptive.articles_per_year)
plot_subcommand.add_command(plots.descriptive.distinct_publishers)
plot_subcommand.add_command(plots.descriptive.stories_per_publisher)
plot_subcommand.add_command(plots.descriptive.top_publishers)
plot_subcommand.add_command(plots.descriptive.common_tld)
plot_subcommand.add_command(plots.sentence.sentence_pca)
plot_subcommand.add_command(plots.sentence.avg_sentence_pca)
plot_subcommand.add_command(plots.emotion.emotion_over_time)
plot_subcommand.add_command(plots.emotion.emotion_regression)
plot_subcommand.add_command(plots.sentiment.over_time)
plot_subcommand.add_command(plots.sentiment.bias_over_time)
plot_subcommand.add_command(plots.sentiment.bias_vs_recent_winner)
plot_subcommand.add_command(plots.links.elbow)
plot_subcommand.add_command(plots.links.link_pca_clusters)
plot_subcommand.add_command(plots.classifier.pca_with_classes)
cli.add_command(plots.descriptive.articles_per_year)
cli.add_command(plots.descriptive.distinct_publishers)
cli.add_command(plots.descriptive.stories_per_publisher)
cli.add_command(plots.descriptive.top_publishers)
cli.add_command(plots.descriptive.common_tld)
cli.add_command(plots.sentence.sentence_pca)
cli.add_command(plots.sentence.avg_sentence_pca)
cli.add_command(plots.emotion.emotion_over_time)
cli.add_command(plots.emotion.emotion_regression)
cli.add_command(plots.sentiment.over_time)
cli.add_command(plots.sentiment.bias_over_time)
cli.add_command(plots.sentiment.bias_vs_recent_winner)
cli.add_command(plots.links.elbow)
cli.add_command(plots.links.link_pca_clusters)
cli.add_command(plots.classifier.pca_with_classes)
train_subcommand.add_command(train.main.main)
train_subcommand.add_command(train.main.validate)
cli()

View File

@@ -2,9 +2,24 @@ import data.main
import data.scrape
import data.factcheck
import data.links
import data.bias
import data.emotion
import data.broken_links
import data.selection
import data.sentence
import data.sentiment
import data.word
__all__ = [
'main'
,'scrape'
,'factcheck'
,'links'
,'bias'
,'emotion'
,'broken_links'
,'selection'
,'sentence'
,'sentiment'
,'word'
]

View File

@@ -3,23 +3,20 @@ import seaborn as sns
import matplotlib.pyplot as plt
import click
from data import connect
from data.main import connect
@click.command(name="broken:crawl")
def crawl():
"""crawl story urls checking for link rot or redirects."""
DB = connect()
urls = DB.query("""
select
id
,url
from stories
order by published_at asc
limit 5
""").fetchall()
DB.close()
with connect() as db:
urls = db.query("""
select
id
,url
from stories
order by published_at asc
limit 5
""").fetchall()
story_id, url = urls[1]
# url

View File

@@ -5,7 +5,7 @@ import pandas as pd
import numpy as np
from transformers import BertTokenizer
from model import BertForMultiLabelClassification
from train.model import BertForMultiLabelClassification
from data.main import connect, data_dir
import seaborn as sns
import matplotlib.pyplot as plt

View File

@@ -8,7 +8,7 @@ from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths
from data.main import connect, map_tld, paths, reporting_label_to_int
from random import randint
from time import sleep
from tqdm import tqdm
@@ -155,7 +155,7 @@ def create_tables():
FROM stories s
""").df()
stories['tld'] = stories.url.apply(map_tld)
raw_stories['tld'] = raw_stories.url.apply(map_tld)
with connect() as db:
db.sql("""
@@ -167,5 +167,25 @@ def create_tables():
JOIN mbfc.publishers p
ON p.tld = s.tld
""")
with connect() as db:
data = db.sql("""
select
id,
reporting
from mbfc.publishers p
""").df()
with connect() as db:
db.sql("""
alter table mbfc.publishers add column reporting_ordinal int
""")
data['ordinal'] = data.reporting.apply(reporting_label_to_int)
with connect() as db:
db.sql("""
update mbfc.publishers
set reporting_ordinal = data.ordinal
from data
where data.id = publishers.id
""")

View File

@@ -22,6 +22,8 @@ def paths(name='app'):
return Path(os.environ['DATA_MINING_DOCS_DIR'])
if 'figure' in name:
return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
if 'model' in name:
return Path(os.environ['DATA_MINING_DATA_DIR']) / 'models'
def connect():
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -105,3 +107,32 @@ def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
except:
print(f"no mapping for {class_id}", file=sys.stderr)
return -1
def reporting_label_to_int(label):
    """Map an MBFC factual-reporting label to an ordinal integer.

    'Mixed' and any unknown/missing label map to -1, which downstream
    queries use as the exclusion sentinel (``WHERE ordinal != -1``).

    :param label: reporting label string, e.g. ``'High'``.
    :return: ordinal in ``{0, 1, 3, 4, 5}``, or ``-1`` for 'Mixed'/unknown.
    """
    mapping = {
        'Very Low': 0,
        'Low': 1,
        'Mixed': -1,          # deliberately excluded from the ordinal scale
        'Mostly Factual': 3,  # NOTE(review): scale skips 2 — confirm intent
        'High': 4,
        'Very High': 5,
    }
    # dict.get replaces the previous bare except/-1 fallback: same result,
    # but no longer swallows unrelated exceptions.
    return mapping.get(label, -1)
def save_model(model, name):
    """Pickle *model* into the models directory under filename *name*."""
    import pickle

    save_to = paths('models') / name
    # pathlib's open() instead of the builtin — same file, same mode
    with save_to.open('wb') as file:
        pickle.dump(model, file)
    print(f"saved model: {save_to}")
def load_model(name):
    """Unpickle and return the model stored under *name* in the models directory."""
    import pickle

    open_from = paths('models') / name
    print(f"loading model: {open_from}")
    with open_from.open('rb') as file:
        return pickle.load(file)

View File

@@ -1,13 +1,11 @@
import click
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, paths
import os
from pathlib import Path
from data.main import connect, paths, save_model, load_model, ticklabels
import numpy as np
import pandas as pd
from tqdm import tqdm
import click
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
@@ -24,15 +22,14 @@ def embed(chunks):
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# load data
DB = connect()
table = DB.sql("""
select
id
,title
from stories
order by id desc
""").df()
DB.close()
with connect() as db:
table = db.sql("""
select
id
,title
from stories
order by id desc
""").df()
# normalize text
table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -67,7 +64,7 @@ def embed(chunks):
print(f"embeddings saved: {save_to}")
# save ids
save_to = data_dir() / 'embedding_ids.npy'
save_to = paths('data') / 'embedding_ids.npy'
np.save(save_to, ids)
print(f"ids saved: {save_to}")
@@ -133,25 +130,28 @@ def create_pca_table():
embeddings = np.load(path('data') / 'embeddings.npy')
embedding_ids = np.load(path('data') / 'embedding_ids.npy')
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
with connect() as db:
data = db.query("""
SELECT
ids.index
,s.id
,b.ordinal
,p.bias
,p.ordinal
FROM ids
JOIN top.stories s
JOIN stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN mbfc.publisher_stories ps
ON s.id = ps.story_id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
""").df()
pub = db.query("""
SELECT
*
FROM top.publishers
FROM mbfc.publishers
""").df()
x = embeddings[data['index']]
@@ -161,8 +161,7 @@ def create_pca_table():
data['first'] = pred[:, 0]
data['second'] = pred[:, 1]
table_name = f"top.story_embeddings_pca"
table_name = f"story_embeddings_pca"
with connect() as db:
db.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
@@ -172,11 +171,12 @@ def create_pca_table():
,data.second as second
FROM data
""")
print(f"created {table_name}")
@click.command('sentence:create-svm-table')
def create_svm_table():
"""sentence to classifier"""
from sklearn import svm
from sklearn.linear_model import SGDClassifier
@@ -189,22 +189,99 @@ def create_svm_table():
SELECT
ids.index
,s.id
,b.ordinal
,p.ordinal
,p.bias
FROM ids
JOIN top.stories s
JOIN stories s
ON ids.story_id = s.id
JOIN top.publisher_bias pb
ON pb.publisher_id = s.publisher_id
JOIN bias_ratings b
ON b.id = pb.bias_id
JOIN mbfc.publisher_stories ps
ON s.id = ps.story_id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
""").df()
x = embeddings[data['index']]
#y = data['ordinal'].to_numpy().reshape(-1, 1)
y = data['ordinal']
model = SGDClassifier()
pred = model.fit(x, y)
data['pred'] = pred.predict(x)
data
model = model.fit(x, y)
# data['pred'] = pred.predict(x)
save_model(model, 'sgdclassifier.pkl')
def interence():
    """Exploratory inference scratchpad: classify two hand-written headlines
    with the saved SGDClassifier.

    NOTE(review): the name is a typo for 'inference'. It is not registered as
    a click command anywhere visible, so it appears to be REPL-only code.
    """
    with connect() as db:
        # distinct (bias label, ordinal) pairs, ordered by ordinal — fetched
        # but not used below; presumably for eyeballing the label scale.
        bias = db.query("""
            SELECT
                p.bias
                ,p.ordinal
            FROM mbfc.publishers p
            WHERE p.ordinal != -1
            GROUP BY
                p.bias
                ,p.ordinal
            ORDER BY
                p.ordinal
        """).df()

    sdg = load_model( 'sgdclassifier.pkl')

    # NOTE(review): `tokenizer` and `model` are not defined in this function —
    # as written this raises NameError unless run interactively after embed()
    # has created them; confirm before wiring this up as a command.
    tokens = tokenizer(["hello, i hate woke culture.", "trump is winning"], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)

    # mean-pool token embeddings, L2-normalize, then classify
    output = mean_pooling(output, tokens['attention_mask'])
    output = F.normalize(output, p=2, dim=1)
    sdg.predict(output)

    # leftover REPL inspection statements (no effect when run as a script)
    tokens
    dir(output)
def validation():
    """Hold-out validation of a LinearSVC bias classifier on sentence embeddings.

    Loads embeddings and their publisher bias ordinals, does a 67/33
    train/test split, fits LinearSVC, and saves a confusion-matrix plot
    of the test-set predictions.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC
    from sklearn.metrics import ConfusionMatrixDisplay
    import matplotlib.pyplot as plt

    # fix: save_to was previously undefined, so plt.savefig raised NameError
    save_to = paths('figures') / 'sentence_confusion.png'

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    # `ids` is referenced by name inside the SQL (duckdb scans local frames)
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    y = data['ordinal']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    clf = LinearSVC()
    clf.fit(x_train, y_train)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, clf.predict(x_test), ax=ax)
    # fix: title said "kNN classifier" but the model is a LinearSVC
    ax.set(title="confusion matrix for LinearSVC classifier on test data.", xticklabels=ticklabels(), yticklabels=ticklabels())
    # fix: save before show() — show() can consume/clear the figure, leaving
    # savefig to write a blank image
    plt.savefig(save_to)
    print(f"saved: {save_to}")
    plt.show()

View File

@@ -20,15 +20,14 @@ def extract(chunks):
# load data
DB = connect()
table = DB.sql("""
select
id
,title
from stories
order by id desc
""").df()
DB.close()
with connect() as db:
table = db.sql("""
select
id
,title
from stories
order by id desc
""").df()
# normalize text
table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -56,12 +55,12 @@ def extract(chunks):
story_ids = np.concatenate(story_ids)
# save embeddings
save_to = data_dir() / 'sentiment.npy'
save_to = paths('data') / 'sentiment.npy'
np.save(save_to, sentiments)
print(f"sentiments saved: {save_to}")
# save ids
save_to = data_dir() / 'sentiment_ids.npy'
save_to = paths('data') / 'sentiment_ids.npy'
np.save(save_to, story_ids)
print(f"ids saved: {save_to}")

View File

@@ -1,7 +1,7 @@
import click
from transformers import AutoTokenizer, RobertaModel
import numpy as np
from data.main import Data, from_db, connect, data_dir
from data.main import connect, paths
from tqdm import tqdm
import torch
from pathlib import Path
@@ -9,30 +9,23 @@ from pathlib import Path
@click.command(name="word:max-sequence")
def max_sequence():
"""calculate the maximum token length given the story titles"""
db = connect()
longest = db.sql("""
select
title
from stories
order by length(title) desc
limit 5000
""").df()
db.close()
with connect() as db:
longest = db.sql("""
select
title
from stories
order by length(title) desc
limit 5000
""").df()
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokens = tokenizer(longest['title'].to_list())
print(f"{max([len(x) for x in tokens['input_ids']])}")
@click.command(name="word:train")
def train():
"""TODO"""
table = from_db(Data.Titles)
n_classes = 10
@click.command(name="word:embed")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
@click.option('--embedding_dir', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'embeddings'), show_default=True)
@click.option('--token_dir', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'tokens'), show_default=True)
@click.option('--embedding_dir', help="path to save embeddings as np array", type=Path, default=Path(paths('data') / 'embeddings'), show_default=True)
@click.option('--token_dir', help="path to save tokens as np array", type=Path, default=Path(paths('data') / 'tokens'), show_default=True)
@click.option('--device', help="device to process data on", type=str, default="cuda:0", show_default=True)
def embed(chunks, embedding_dir, token_dir, device):
""" given titles, generate tokens and word embeddings and saves to disk """
@@ -44,14 +37,13 @@ def embed(chunks, embedding_dir, token_dir, device):
model.to(device)
# load data
db = connect()
table = db.sql("""
select
title
from stories
order by id desc
""").df()
db.close()
with connect() as db:
table = db.sql("""
select
title
from stories
order by id desc
""").df()
# normalize text
table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -82,7 +74,7 @@ def distance():
closest = np.unravel_index(min_index, distances.shape)
distances.flatten().shape
# path = data_dir() / 'embeddings'
# path = paths('data') / 'embeddings'
# chunks = [x for x in path.iterdir() if x.match('*.npy')]
# chunks = sorted(chunks, key=lambda x: int(x.stem.split('_')[1]))
#
@@ -98,4 +90,4 @@ def distance():
#
# data.shape
#
# np.save(data, data_dir() / 'embeddings.npy')
# np.save(data, paths('data') / 'embeddings.npy')

9
src/mining/__init__.py Normal file
View File

@@ -0,0 +1,9 @@
import mining.main
import mining.apriori
import mining.bias
__all__ = [
'main'
,'apriori'
,'bias'
]

View File

@@ -1,3 +1,5 @@
import click
from efficient_apriori import apriori
from data.main import connect

View File

@@ -15,49 +15,3 @@ class Model(nn.Module):
outs = self.act(self.linear(outs.last_hidden_state))
return outs
import torch.nn as nn
from transformers import BertPreTrainedModel, BertModel
class BertForMultiLabelClassification(BertPreTrainedModel):
    """BERT with a multi-label classification head.

    A linear layer over BERT's pooled [CLS] output produces one logit per
    label, trained with BCEWithLogitsLoss — so each label is scored
    independently (multi-label), and *labels* are expected as multi-hot
    float tensors when supplied.
    """

    def __init__(self, config):
        super().__init__(config)
        # label count comes from the transformers config object
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
        # BCE-with-logits => independent sigmoid per label (multi-label setup)
        self.loss_fct = nn.BCEWithLogitsLoss()

        # transformers hook: initializes weights per config
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run BERT then the classifier head.

        Returns a tuple ``(loss), logits, (hidden_states), (attentions)``:
        ``loss`` is prepended only when *labels* is given; the trailing
        entries are present only if the underlying BertModel is configured
        to emit them. Callers unpack by position — keep this order.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        # outputs[1] is the pooled [CLS] representation
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            loss = self.loss_fct(logits, labels)
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

View File

@@ -1,5 +0,0 @@
import pandas as pd
import math
df = pd.read_csv('/tmp/attr.csv')
((((df.left - 9.1) ** 2) + ((df.right - 11.0) ** 2)) ** 0.5).sort_values()

View File

@@ -3,6 +3,7 @@ import plots.emotion
import plots.sentiment
import plots.links
import plots.classifier
import plots.descriptive
__all__ = [
'sentence'
@@ -10,4 +11,5 @@ __all__ = [
'sentiment',
'links',
'classifier',
'descriptive',
]

View File

@@ -5,7 +5,7 @@ import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
@click.command('plot:pca-with-classes')
@click.command('classifier:pca-with-classes')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def pca_with_classes(source):

View File

@@ -6,7 +6,7 @@ import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
@click.command('plot:articles-per-year')
@click.command('descriptive:articles-per-year')
def articles_per_year():
save_to = paths('figures') / 'articles_per_year.png'
@@ -27,29 +27,34 @@ def articles_per_year():
plt.savefig(save_to)
print(f"saved: {save_to}")
@click.command('plot:distinct-publishers')
@click.command('descriptive:distinct-publishers')
def distinct_publishers():
save_to = paths('figures') / 'distinct_publishers.png'
with connect() as db:
data = DB.query("""
data = db.query("""
select
year(published_at) as year
,count(distinct publisher_id) as publishers
from stories
group by
year(published_at)
count(distinct p.id) as publishers
,date_trunc('year', s.published_at) as date
from stories s
join mbfc.publisher_stories ps
on s.id = ps.story_id
join mbfc.publishers p
on ps.publisher_id = p.id
and year(s.published_at) not in (2005, 2023)
group by
date_trunc('year', s.published_at)
""").df()
ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
ax.tick_params(axis='x', rotation=90)
ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
ax = sns.barplot(x=data.date.dt.year, y=data.publishers, color='tab:blue')
ax.tick_params(axis='x', rotation=45)
ax.set(ylabel="count of publishers (#)", xlabel="year")
plt.tight_layout()
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:stories-per-publisher')
@click.command('descriptive:stories-per-publisher')
def stories_per_publisher():
save_to = paths('figures') / 'stories_per_publisher.png'
@@ -100,7 +105,7 @@ def stories_per_publisher():
print(f"saved: {save_to}")
@click.command('plot:top-publishers')
@click.command('descriptive:top-publishers')
def top_publishers():
"""plot top publishers over time"""
@@ -164,7 +169,7 @@ def top_publishers():
print(f"saved: {save_to}")
@click.command('plot:common_tld')
@click.command('descriptive:common_tld')
def common_tld():
import dataframe_image as dfi
save_to = paths('figures') / 'common_tld.png'
@@ -189,42 +194,71 @@ def common_tld():
def stats():
# raw
DB.query("""
SELECT
'total stories' as key
,COUNT(1) as value
FROM stories
UNION
SELECT
'total related' as key
,COUNT(1) as value
FROM related_stories
UNION
SELECT
'top level domains' as key
,COUNT(distinct tld) as value
FROM stories
UNION
SELECT
'publishers' as key
,COUNT(1) as value
FROM publishers
UNION
SELECT
'authors' as key
,COUNT(distinct author) as value
FROM stories
UNION
SELECT
'min year' as key
,min(year(published_at)) as value
FROM stories
UNION
SELECT
'max year' as key
,max(year(published_at)) as value
FROM stories
""").df().to_markdown(index=False)
with connect() as db:
db.query("""
SELECT
'total stories' as key
,COUNT(1) as value
FROM stories
UNION
SELECT
'total related' as key
,COUNT(1) as value
FROM related_stories
UNION
SELECT
'top level domains' as key
,COUNT(distinct tld) as value
FROM stories
UNION
SELECT
'publishers' as key
,COUNT(1) as value
FROM mbfc.publishers
UNION
SELECT
'authors' as key
,COUNT(distinct author) as value
FROM stories
UNION
SELECT
'years' as key
,min(year(published_at)) || '-' || min(year(published_at)) as value
FROM stories
UNION
SELECT
'max year' as key
,max(year(published_at)) as value
FROM stories
UNION
SELECT
'publishers with ratings' as key
,count(distinct ps.publisher_id)
FROM mbfc.publisher_stories ps
UNION
SELECT
'publishers without ratings' as key
,count(distinct s.publisher_id)
from stories s
left join mbfc.publisher_stories ps
on ps.story_id = s.id
where ps.publisher_id is null
UNION
SELECT
'stories with ratings' as key
,count(distinct ps.story_id)
FROM mbfc.publisher_stories ps
UNION
SELECT
'stories without ratings' as key
,count(distinct s.id)
from stories s
left join mbfc.publisher_stories ps
on ps.story_id = s.id
where ps.publisher_id is null
""")
#.df().to_markdown(index=False)
# selected
DB.query("""
@@ -264,7 +298,7 @@ def stats():
FROM top.stories
""").df().to_markdown(index=False)
@click.command('plot:bias-stats')
@click.command('descriptive:bias-stats')
def bias_stats():
import dataframe_image as dfi
save_to = paths('figures') / 'bias_stats.png'
@@ -322,7 +356,7 @@ def bias_stats():
DB.close()
print(df.to_markdown(index=False))
@click.command('plot:bias-over-time')
@click.command('descriptive:bias-over-time')
def bias_over_time():
"""plot bias labels over time"""

View File

@@ -6,7 +6,7 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
@click.command('plot:emotion-over-time')
@click.command('emotion:over-time')
def emotion_over_time():
filename = "emotion_over_time.png"
@@ -35,7 +35,7 @@ def emotion_over_time():
print(f"saved: {save_to}")
os.system(f'xdg-open {save_to}')
@click.command('plot:emotion-regression')
@click.command('emotion:regression')
def emotion_regression():
"""plot emotion over time as regression"""
@@ -114,7 +114,7 @@ def emotion_regression():
plt.close()
print(f"saved: {save_to}")
@click.command('plot:emotion-hist')
@click.command('emotion:hist')
def emotion_hist():
filename = "emotion_hist.png"

View File

@@ -1,16 +1,13 @@
import click
from data.main import connect
from links import to_matrix
import os
from data.main import connect, ticklabels, paths
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd
@click.command('plot:link-elbow')
@click.command('links:elbow')
def elbow():
from sklearn.cluster import KMeans
@@ -42,7 +39,7 @@ def elbow():
# randomly pick 8
@click.command('plot:link-pca-clusters')
@click.command('links:pca-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source):
@@ -57,20 +54,22 @@ def link_pca_clusters(source):
,pca.first
,pca.second
,s.cnt as stories
FROM top.publisher_clusters_{source} c
JOIN top.publishers p
ON c.publisher_id = p.id
FROM publisher_clusters_{source} c
JOIN mbfc.publisher_stories ps
ON ps.publisher_id = c.publisher_id
JOIN mbfc.publishers p
ON ps.publisher_id = p.id
JOIN
(
select
s.publisher_id
p.id as publisher_id
,count(1) as cnt
FROM top.stories s
FROM mbfc.publishers p
GROUP BY
s.publisher_id
p.id
) s
ON s.publisher_id = p.id
JOIN top.publisher_pca_{source} pca
JOIN publisher_pca_{source} pca
ON pca.publisher_id = p.id
""").df()
@@ -107,7 +106,7 @@ def test():
""")
@click.command('plot:link-confusion')
@click.command('links:confusion')
def link_confusion():
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
@@ -166,7 +165,7 @@ def link_confusion():
plt.close()
print(f"saved plot: {save_to}")
@click.command('plot:link-classifier')
@click.command('links:classifier')
def link_confusion():
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
@@ -178,18 +177,16 @@ def link_confusion():
bias = db.query("""
SELECT
p.id as publisher_id
,b.ordinal
FROM top.publishers p
JOIN top.publisher_bias pb
ON pb.publisher_id = p.id
JOIN bias_ratings b
ON b.id = pb.bias_id
,p.ordinal
FROM mbfc.publishers p
where ordinal != -1
""").df()
with connect() as db:
df = db.query("""
SELECT
*
FROM top.link_edges
FROM link_edges
WHERE parent_id in (
select
publisher_id
@@ -203,36 +200,22 @@ def link_confusion():
""").df()
pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
x = pivot.values
y = bias.sort_values('publisher_id').ordinal
with connect() as db:
data = db.query(f"""
SELECT
p.id as publisher_id
,pca.first
,pca.second
FROM top.publisher_pca_onehot pca
JOIN top.publishers p
ON pca.publisher_id = p.id
""").df()
publisher_matrix = pd.merge(pivot, bias, left_on='parent_id', right_on='publisher_id')
x = publisher_matrix.loc[:, ~publisher_matrix.columns.isin(['publisher_id', 'ordinal'])].values
y = publisher_matrix['ordinal']
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x, y)
y_pred = model.predict(x)
plot = bias.sort_values('publisher_id')
plot['pred'] = y_pred
data = pd.merge(plot, data)
publisher_matrix['pred'] = y_pred
publisher_matrix
fig, ax = plt.subplots(figsize=(10, 5))
ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
fig, ax = plt.subplots(figsize=(5, 5))
ConfusionMatrixDisplay.from_predictions(publisher_matrix['ordinal'], publisher_matrix['pred'], ax=ax)
ax.set(xticklabels=ticklabels(), yticklabels=ticklabels())
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(save_to)
plt.close()
print(f"saved plot: {save_to}")

View File

@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
@click.command('plot:sentence-pca')
@click.command('sentence:pca')
def sentence_pca():
save_to = paths('figures') / "embedding_sentence_pca.png"
@@ -30,7 +30,7 @@ def sentence_pca():
ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(save_to)
@click.command('plot:avg-sentence-pca')
@click.command('sentence:avg-pca')
def avg_sentence_pca():
save_to = paths('figures') / "avg_embedding_sentence_pca.png"
@@ -54,7 +54,7 @@ def avg_sentence_pca():
ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
plt.savefig(save_to)
@click.command('plot:sentence-confusion')
@click.command('sentence:confusion')
def sentence_confusion():
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

View File

@@ -3,7 +3,7 @@ from data.main import connect, paths, ticklabels
import seaborn as sns
import matplotlib.pyplot as plt
@click.command('plot:sentiment-over-time')
@click.command('sentiment:over-time')
def over_time():
filename = "sentiment_over_time.png"
@@ -28,7 +28,74 @@ def over_time():
plt.close()
print(f"saved: {save_to}")
@click.command('plot:bias-vs-sentiment-over-time')
@click.command('sentiment:bias-over-time')
def bias_over_time():
"""plot sentiment/bias vs. time"""
filename = "publisher_avg_sentiment_vs_bias_over_time.png"
save_to = paths('figures') / filename
with connect() as db:
data = db.sql("""
with cte as (
SELECT
avg(sent.class_id) as sentiment
,date_trunc('yearweek', s.published_at) as date
,p.id
,p.bias
FROM story_sentiments sent
JOIN stories s
ON s.id = sent.story_id
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON p.id = ps.publisher_id
WHERE p.ordinal != -1
and year(date) not in (2005, 2023)
GROUP BY
date_trunc('yearweek', s.published_at)
,p.id
,p.bias
) ,b as (
select
avg(sentiment) as sentiment
,median(sentiment) as median_sentiment
,bias
,date
from cte
group by
bias
,date
)
select
median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
,bias
,date
from b
""").df()
ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
ax.set(ylabel='8 week rolling avg. sentiment', xlabel='date', ylim=[0,1])
plt.tight_layout()
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
# from scipy.stats import pearsonr
# pivot = data.pivot(index=['date'], columns=['bias'], values='sentiment')
#
#
# for left in pivot.keys():
# for right in pivot.keys():
# if left != right:
# result = pearsonr(pivot[left], pivot[right])
# print(f"{left:<15}/{right:<15} | p: {result.pvalue:.2e} | coef: {result.statistic:.3f}")
#
# pivot
@click.command('sentiment:bias-over-time')
def bias_over_time():
"""plot sentiment/bias vs. time"""
@@ -62,16 +129,15 @@ def bias_over_time():
WHERE year(date) not in (2005, 2023)
""").df()
#ax = sns.relplot(data, x='date', y='sentiment', col='bias', palette='rainbow', hue='bias', col_order=ticklabels())
ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
ax.set(title='sentiment and bias vs. time', ylabel='8 week rolling avg. sentiment', xlabel='date')
ax.set(ylabel='8 week rolling avg. sentiment', xlabel='date', ylim=[0,1])
plt.tight_layout()
plt.savefig(save_to)
plt.close()
print(f"saved: {save_to}")
@click.command('plot:sentiment-recent-winner')
@click.command('sentiment:recent-winner')
def bias_vs_recent_winner():
"""plot bias vs. distance to election"""
@@ -106,7 +172,7 @@ def bias_vs_recent_winner():
plt.close()
print(f"saved: {save_to}")
@click.command('plot:sentiment-hist')
@click.command('sentiment:hist')
def sentiment_hist():
filename = "sentiment_hist.png"

View File

@@ -1,5 +1,7 @@
import train.main
import train.model
__all__ = [
'main'
,'model'
]

View File

@@ -1,38 +1,104 @@
from torch.utils.data import Dataset
from data.main import connect, data_dir
from bias import label_to_int
from data.main import connect, paths
import numpy as np
import pandas as pd
import os
class NewsDataset(Dataset):
def __init__(self):
self.embeddings = np.load(data_dir() / 'embeddings.npy')
embedding_ids = pd.DataFrame(np.load(data_dir() / 'embedding_ids.npy'), columns=['id']).reset_index()
self.embeddings = np.load(paths('data') / 'embeddings.npy')
self.embedding_ids = pd.DataFrame(np.load(paths('data') / 'embedding_ids.npy'), columns=['id']).reset_index()
DB = connect()
query = """
SELECT
s.id
,b.label
,count(1) over (partition by publisher_id) as stories
FROM stories s
JOIN publisher_bias b
ON b.id = s.publisher_id
WHERE b.label != 'allsides'
"""
data = DB.sql(query).df()
DB.close()
data['label'] = data['label'].apply(lambda x: label_to_int(x))
data = data.merge(embedding_ids)
self.data = data
with connect() as db:
self.data = db.sql("""
WITH cte AS (
SELECT
s.id
,p.ordinal
,date_part('epoch', s.published_at) as epoch
,count(1) over(partition by p.id) as publisher_stories
,row_number() over(partition by p.ordinal) as label_row
FROM stories s
JOIN mbfc.publisher_stories ps
ON ps.story_id = s.id
JOIN mbfc.publishers p
ON ps.publisher_id = p.id
WHERE p.ordinal != -1
)
SELECT
id
,epoch
,publisher_stories
,ordinal
FROM cte
WHERE label_row < 40000
""").df()
self.data = self.data.merge(self.embedding_ids)
self.data['epoch_norm'] = (self.data['epoch'] - self.data['epoch'].min())/(self.data['epoch'].max()-self.data['epoch'].min())
self.data['publisher_stories_norm'] = (self.data['publisher_stories'] - self.data['publisher_stories'].min())/(self.data['publisher_stories'].max()-self.data['publisher_stories'].min())
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
row = self.data.iloc[idx]
y = row['label']
# x = np.concatenate((self.embeddings[row['index']], [row['stories']])).astype(np.float32)
x = self.embeddings[row['index']]
y = int(row['ordinal'])
x = self.embeddings[int(row['index'])]
x = np.append(x, row[['epoch_norm', 'publisher_stories_norm']].values).astype(np.float32)
return x, y
def normalized_epoch(self, idx):
epoch = self.data['epoch']
return (epoch.iloc[idx]-epoch.min())/(epoch.max()-epoch.min())
def normalized_stories(self, idx):
count = self.data['publisher_stories']
return (count.iloc[idx]-count.min())/(count.max()-count.min())
def get_in_out_size(self):
return int(os.getenv('EMBEDDING_LENGTH', 384)), int(os.getenv('CLASSES', 5)),
class PublisherDataset(Dataset):
    """Dataset of one sample per (publisher, ordinal) pair.

    Each sample's feature vector is the mean of the story-title embeddings
    published by that publisher, and the target is the publisher's MBFC bias
    ordinal. Publishers with ordinal -1 (unrated) are excluded; each ordinal
    class is capped at 40,000 story rows before aggregation.
    """

    def __init__(self):
        # Precomputed story embeddings plus the parallel id array that maps
        # embedding row index -> story id.
        embeddings = np.load(paths('data') / 'embeddings.npy')
        embedding_ids = pd.DataFrame(np.load(paths('data') / 'embedding_ids.npy'), columns=['id']).reset_index()

        with connect() as db:
            data = db.sql("""
            WITH cte AS (
                SELECT
                    s.id
                    ,p.id as publisher_id
                    ,p.ordinal
                    ,row_number() over(partition by p.ordinal) as label_row
                FROM stories s
                JOIN mbfc.publisher_stories ps
                    ON ps.story_id = s.id
                JOIN mbfc.publishers p
                    ON ps.publisher_id = p.id
                WHERE p.ordinal != -1
            )
            SELECT
                id
                ,ordinal
                ,publisher_id
            FROM cte
            WHERE label_row < 40000
            """).df()

        # BUG FIX: previously merged `self.embedding_ids`, which is never
        # assigned on this class (AttributeError on construction). Join the
        # local frame loaded above to attach each story's embedding row index.
        data = data.merge(embedding_ids)

        # Aggregate: mean embedding per (publisher, ordinal) -> one sample.
        self.x = []
        self.y = []
        for (publisher_id, ordinal), group in data.groupby(['publisher_id', 'ordinal'])[['ordinal', 'index']]:
            self.x.append(embeddings[group['index']].mean(axis=0))
            self.y.append(ordinal)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        # Returns (mean_embedding, ordinal_label) for one publisher.
        return self.x[idx], self.y[idx]

    def get_in_out_size(self):
        """Return (input_dim, n_classes) for model construction, from env with defaults."""
        return int(os.getenv('EMBEDDING_LENGTH', 384)), int(os.getenv('CLASSES', 5)),

View File

@@ -5,34 +5,32 @@ from dotenv import load_dotenv
import os
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from accelerate import Accelerator
from train.dataset import NewsDataset
from train.model import Classifier
#from model.linear import LinearClassifier
from data.main import paths, connect, ticklabels
import numpy as np
import pandas as pd
class Stage(Enum):
    """Phase of the training loop; selects between optimizing and evaluating."""
    TRAIN = auto()  # gradient updates enabled
    DEV = auto()    # held-out evaluation pass
@click.command('train:main')
def main():
dev_after = 20
@click.command('main')
@click.option('--epochs', default=10, type=int)
def main(epochs):
dev_after = 5
visible_devices = None
lr = 1e-4
epochs = 10
debug = False
torch.manual_seed(0)
num_workers = 0
num_workers = int(os.getenv('NUMBER_OF_WORKERS', 0))
embedding_length = int(os.getenv('EMBEDDING_LENGTH', 384))
dataset = NewsDataset()
trainset, devset = torch.utils.data.random_split(dataset, [0.8, 0.2])
batch_size = 512
batch_size = int(os.getenv('BATCH_SIZE', 512))
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True)
devloader = DataLoader(devset, shuffle=False, num_workers=num_workers)
accelerator = Accelerator()
@@ -46,7 +44,7 @@ def main():
#accelerator.log({"message" :"debug enabled"})
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# wrap objects with accelerate
model, optimizer, trainloader, devloader = accelerator.prepare(model, optimizer, trainloader, devloader)
@@ -76,57 +74,45 @@ def main():
for epoch in range(epochs):
if (epoch - 1) % dev_after == 0:
if (epoch + 1) % dev_after == 0:
stage = Stage.DEV
log = run()
print(f"dev loss: {log}")
else:
stage = Stage.TRAIN
log = run()
print(f"train loss: {log}")
print(f"dev loss: {log:.3f}")
stage = Stage.TRAIN
log = run()
print(f"train loss: {log:.3f}")
torch.save(model.state_dict(), paths('model') / 'torch_clf.pth')
@click.command('validate')
def validate():
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
embeddings = np.load(paths('data') / 'embeddings.npy')
embedding_ids = pd.DataFrame(np.load(paths('data') / 'embedding_ids.npy'), columns=['id']).reset_index()
embedding_length = int(os.getenv('EMBEDDING_LENGTH', 384))
model = Classifier(embedding_length=embedding_length, classes=5)
model.load_state_dict(torch.load(paths('model') / 'torch_clf.pth'))
model.eval()
dataset = NewsDataset()
y = dataset[:][1]
with torch.no_grad():
out = model(torch.tensor(dataset[:][0]))
sns.histplot(pd.DataFrame(out).melt(), x='value', hue='variable', palette='rainbow')
out_path = (paths('data') / 'runs')
out_path.mkdir(exist_ok=True)
plt.savefig(out_path / 'label_hist.png')
plt.close()
y_pred = out.argmax(axis=1)
fig, ax = plt.subplots(figsize=(10, 5))
ConfusionMatrixDisplay.from_predictions(y, y_pred, ax=ax)
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels(), yticklabels=ticklabels())
plt.savefig(out_path / 'confusion_matrix.png')
plt.close()
breakpoint()
from data.main import data_dir, connect
import numpy as np
import pandas as pd
from bias import int_to_label
embeddings = dataset.embeddings
embedding_ids = dataset.data
DB = connect()
query = """
SELECT
s.id
,title
,p.name
,count(1) over (partition by publisher_id) as stories
FROM stories s
JOIN publishers p
on p.id = s.publisher_id
WHERE s.publisher_id NOT IN (
SELECT
id
FROM publisher_bias b
)
"""
data = DB.sql(query).df()
embeddings = np.load(data_dir() / 'embeddings.npy')
embedding_ids = pd.DataFrame(np.load(data_dir() / 'embedding_ids.npy'), columns=['id']).reset_index()
for i in range(10):
embedding = embeddings[embedding_ids[embedding_ids['id'] == data.iloc[i]['id']]['index']]
title = data.iloc[i]['title']
publisher = data.iloc[i]['name']
class_pred = nn.functional.softmax( model(torch.tensor(embedding))).detach()
class_id = int(torch.argmax(nn.functional.softmax( model(torch.tensor(embedding))).detach()))
print(f"{publisher}: {int_to_label(class_id)} - \"{title}\"")
embedding_ids['id'] == data.iloc[0]['id']
embedding_ids[embedding_ids['id'] == data.iloc[0]['id']]
embedding = embeddings[embedding_ids[embedding_ids['id'] == data.iloc[0]['id']]['index']]
title
publisher
model().get_last_layer(torch.tensor(embedding))

View File

@@ -1,4 +1,5 @@
from torch import nn
from transformers import BertPreTrainedModel, BertModel
class Classifier(nn.Module):
def __init__(self, embedding_length: int, classes: int):
@@ -26,3 +27,47 @@ class Classifier(nn.Module):
def get_last_layer(self, x):
x = self.stack(x)
return x
class BertForMultiLabelClassification(BertPreTrainedModel):
    """BERT encoder with a multi-label classification head.

    Applies dropout to BERT's pooled output, projects it to `num_labels`
    logits, and — when `labels` is supplied — computes BCE-with-logits loss
    (independent sigmoid per label, suitable for multi-label targets).
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
        self.loss_fct = nn.BCEWithLogitsLoss()
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # encoder_out[1] is the pooled [CLS] representation; index access
        # (rather than attribute access) is kept so return_dict behavior is
        # unchanged from the original implementation.
        logits = self.classifier(self.dropout(encoder_out[1]))

        # Tuple layout mirrors HF convention: logits first, then any hidden
        # states / attentions the encoder returned.
        result = (logits,) + encoder_out[2:]
        if labels is None:
            return result  # (logits, hidden_states?, attentions?)
        return (self.loss_fct(logits, labels),) + result  # loss prepended