rough draft of paper done.
src/cli.py (111 changed lines)
@@ -2,70 +2,85 @@ import click
from dotenv import load_dotenv
import data
import plots
import mining
import train

@click.group()
def cli():
    ...

@cli.group(name="data")
def data_subcommand():
    """data subcommands"""
    ...

@cli.group(name="mining")
def mining_subcommand():
    """mining subcommands"""
    ...

@cli.group(name="plot")
def plot_subcommand():
    """plotting subcommands"""
    ...

@cli.group(name="train")
def train_subcommand():
    """train subcommands"""
    ...

if __name__ == "__main__":
    load_dotenv()

    # original bias ratings
    cli.add_command(data.scrape.download)
    cli.add_command(data.scrape.parse)
    cli.add_command(data.scrape.load)
    cli.add_command(data.scrape.normalize)
    cli.add_command(data.scrape.create_elections_table)
    data_subcommand.add_command(data.scrape.download)
    data_subcommand.add_command(data.scrape.parse)
    data_subcommand.add_command(data.scrape.load)
    data_subcommand.add_command(data.scrape.normalize)
    data_subcommand.add_command(data.scrape.create_elections_table)

    cli.add_command(data.factcheck.parse_index)
    cli.add_command(data.factcheck.scrape)
    data_subcommand.add_command(data.factcheck.parse_index)
    data_subcommand.add_command(data.factcheck.scrape)

    cli.add_command(data.links.create_table)
    cli.add_command(data.links.create_pca)
    cli.add_command(data.links.create_clusters)
    data_subcommand.add_command(data.links.create_table)
    data_subcommand.add_command(data.links.create_pca)
    data_subcommand.add_command(data.links.create_clusters)

    import word
    # cli.add_command(word.distance)
    # cli.add_command(word.train)
    cli.add_command(word.embed)
    cli.add_command(word.max_sequence)
    import bias
    cli.add_command(bias.parse)
    cli.add_command(bias.load)
    cli.add_command(bias.normalize)
    data_subcommand.add_command(data.bias.parse)
    data_subcommand.add_command(data.bias.load)
    data_subcommand.add_command(data.bias.normalize)

    import mine
    cli.add_command(mine.embeddings)
    cli.add_command(mine.cluster)
    cli.add_command(mine.plot)
    data_subcommand.add_command(data.emotion.extract)
    data_subcommand.add_command(data.emotion.normalize)
    data_subcommand.add_command(data.emotion.analyze)
    data_subcommand.add_command(data.emotion.create_table)

    import emotion
    cli.add_command(emotion.extract)
    cli.add_command(emotion.normalize)
    cli.add_command(emotion.analyze)
    cli.add_command(emotion.create_table)
    data_subcommand.add_command(data.word.embed)
    data_subcommand.add_command(data.word.max_sequence)
    data_subcommand.add_command(data.sentence.embed)
    data_subcommand.add_command(data.sentence.create_avg_pca_table)

    import sentence
    cli.add_command(sentence.embed)
    cli.add_command(sentence.create_avg_pca_table)
    mining_subcommand.add_command(mining.main.embeddings)
    mining_subcommand.add_command(mining.main.cluster)
    mining_subcommand.add_command(mining.main.plot)

    from train import main as train_main
    cli.add_command(train_main.main)
    plot_subcommand.add_command(plots.descriptive.articles_per_year)
    plot_subcommand.add_command(plots.descriptive.distinct_publishers)
    plot_subcommand.add_command(plots.descriptive.stories_per_publisher)
    plot_subcommand.add_command(plots.descriptive.top_publishers)
    plot_subcommand.add_command(plots.descriptive.common_tld)
    plot_subcommand.add_command(plots.sentence.sentence_pca)
    plot_subcommand.add_command(plots.sentence.avg_sentence_pca)
    plot_subcommand.add_command(plots.emotion.emotion_over_time)
    plot_subcommand.add_command(plots.emotion.emotion_regression)
    plot_subcommand.add_command(plots.sentiment.over_time)
    plot_subcommand.add_command(plots.sentiment.bias_over_time)
    plot_subcommand.add_command(plots.sentiment.bias_vs_recent_winner)
    plot_subcommand.add_command(plots.links.elbow)
    plot_subcommand.add_command(plots.links.link_pca_clusters)
    plot_subcommand.add_command(plots.classifier.pca_with_classes)

    cli.add_command(plots.descriptive.articles_per_year)
    cli.add_command(plots.descriptive.distinct_publishers)
    cli.add_command(plots.descriptive.stories_per_publisher)
    cli.add_command(plots.descriptive.top_publishers)
    cli.add_command(plots.descriptive.common_tld)
    cli.add_command(plots.sentence.sentence_pca)
    cli.add_command(plots.sentence.avg_sentence_pca)
    cli.add_command(plots.emotion.emotion_over_time)
    cli.add_command(plots.emotion.emotion_regression)
    cli.add_command(plots.sentiment.over_time)
    cli.add_command(plots.sentiment.bias_over_time)
    cli.add_command(plots.sentiment.bias_vs_recent_winner)
    cli.add_command(plots.links.elbow)
    cli.add_command(plots.links.link_pca_clusters)
    cli.add_command(plots.classifier.pca_with_classes)
    train_subcommand.add_command(train.main.main)
    train_subcommand.add_command(train.main.validate)

    cli()
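Every command is registered twice during this migration: once flat on the root cli group (the old invocation style) and once under a named subgroup. A hypothetical shell session sketching both paths, assuming src/cli.py is executed directly:

    python src/cli.py word:embed          # old flat registration on the root group
    python src/cli.py data word:embed     # same command through the data subgroup
    python src/cli.py plot sentence:pca   # plotting commands under the plot group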
@@ -2,9 +2,24 @@ import data.main
import data.scrape
import data.factcheck
import data.links
import data.bias
import data.emotion
import data.broken_links
import data.selection
import data.sentence
import data.sentiment
import data.word

__all__ = [
    'main'
    ,'scrape'
    ,'factcheck'
    ,'links'
    ,'bias'
    ,'emotion'
    ,'broken_links'
    ,'selection'
    ,'sentence'
    ,'sentiment'
    ,'word'
]
@@ -3,23 +3,20 @@ import seaborn as sns
import matplotlib.pyplot as plt
import click

from data import connect
from data.main import connect

@click.command(name="broken:crawl")
def crawl():
    """crawl story urls checking for link rot or redirects."""
    DB = connect()

    urls = DB.query("""
        select
            id
            ,url
        from stories
        order by published_at asc
        limit 5
    """).fetchall()

    DB.close()
    with connect() as db:
        urls = db.query("""
            select
                id
                ,url
            from stories
            order by published_at asc
            limit 5
        """).fetchall()

    story_id, url = urls[1]
    # url
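This hunk shows the refactor repeated throughout the commit: module-level DB = connect() / DB.close() pairs become with-blocks, so the connection is released even when a query raises. A minimal sketch of the pattern, assuming connect() returns a DuckDB connection (DuckDBPyConnection objects act as context managers in recent DuckDB releases); the database path below is hypothetical:

    import duckdb

    def connect():
        # hypothetical path; the real helper resolves DATA_MINING_DATA_DIR
        return duckdb.connect('data/news.duckdb')

    with connect() as db:
        urls = db.query("select id, url from stories limit 5").fetchall()
    # connection closed here, even if the query raised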
@@ -5,7 +5,7 @@ import pandas as pd
import numpy as np

from transformers import BertTokenizer
from model import BertForMultiLabelClassification
from train.model import BertForMultiLabelClassification
from data.main import connect, data_dir
import seaborn as sns
import matplotlib.pyplot as plt
@@ -8,7 +8,7 @@ from pathlib import Path
import os
import sys
import click
from data.main import connect, map_tld, paths
from data.main import connect, map_tld, paths, reporting_label_to_int
from random import randint
from time import sleep
from tqdm import tqdm
@@ -155,7 +155,7 @@ def create_tables():
        FROM stories s
    """).df()

    stories['tld'] = stories.url.apply(map_tld)
    raw_stories['tld'] = raw_stories.url.apply(map_tld)

    with connect() as db:
        db.sql("""
@@ -167,5 +167,25 @@ def create_tables():
            JOIN mbfc.publishers p
                ON p.tld = s.tld
        """)
    with connect() as db:
        data = db.sql("""
            select
                id,
                reporting
            from mbfc.publishers p
        """).df()

    with connect() as db:
        db.sql("""
            alter table mbfc.publishers add column reporting_ordinal int
        """)

    data['ordinal'] = data.reporting.apply(reporting_label_to_int)

    with connect() as db:
        db.sql("""
            update mbfc.publishers
            set reporting_ordinal = data.ordinal
            from data
            where data.id = publishers.id
        """)
@@ -22,6 +22,8 @@ def paths(name='app'):
        return Path(os.environ['DATA_MINING_DOCS_DIR'])
    if 'figure' in name:
        return Path(os.environ['DATA_MINING_DOCS_DIR']) / 'figures'
    if 'model' in name:
        return Path(os.environ['DATA_MINING_DATA_DIR']) / 'models'

def connect():
    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
@@ -105,3 +107,32 @@ def bias_int_to_label(class_id: int, source: str = 'mbfc') -> str:
    except:
        print(f"no mapping for {class_id}", file=sys.stderr)
        return -1

def reporting_label_to_int(label):
    mapping = {
        'Very Low': 0,
        'Low': 1,
        'Mixed': -1,
        'Mostly Factual': 3,
        'High': 4,
        'Very High': 5
    }
    try:
        return mapping[label]
    except:
        return -1

def save_model(model, name):
    import pickle
    save_to = paths('models') / name
    with open(save_to, 'wb') as file:
        pickle.dump(model, file)
    print(f"saved model: {save_to}")

def load_model(name):
    import pickle
    open_from = paths('models') / name
    print(f"loading model: {open_from}")
    with open(open_from, 'rb') as file:
        model = pickle.load(file)
    return model
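A small sketch of how reporting_label_to_int behaves on the MBFC reporting scale defined above: the 'Mixed' bucket and any unmapped label both collapse to the -1 sentinel that downstream queries filter out:

    import pandas as pd

    labels = pd.Series(['Very High', 'Mostly Factual', 'Mixed', 'not-a-label'])
    print(labels.apply(reporting_label_to_int).tolist())  # [5, 3, -1, -1]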
@@ -1,13 +1,11 @@
import click
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from data.main import connect, paths
import os
from pathlib import Path
from data.main import connect, paths, save_model, load_model, ticklabels
import numpy as np
import pandas as pd
from tqdm import tqdm
import click

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
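The body of mean_pooling is cut off by this hunk. For context, a sketch of the conventional sentence-transformers mean-pooling recipe the comment describes (an assumption based on the all-MiniLM-L6-v2 model card, not necessarily the exact body committed here):

    def mean_pooling(model_output, attention_mask):
        # first element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        # broadcast the mask so padded positions contribute nothing
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # sum real-token embeddings, divide by the (clamped) token count
        return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)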
@@ -24,15 +22,14 @@ def embed(chunks):
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # load data
    DB = connect()
    table = DB.sql("""
        select
            id
            ,title
        from stories
        order by id desc
    """).df()
    DB.close()
    with connect() as db:
        table = db.sql("""
            select
                id
                ,title
            from stories
            order by id desc
        """).df()

    # normalize text
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -67,7 +64,7 @@ def embed(chunks):
    print(f"embeddings saved: {save_to}")

    # save ids
    save_to = data_dir() / 'embedding_ids.npy'
    save_to = paths('data') / 'embedding_ids.npy'
    np.save(save_to, ids)
    print(f"ids saved: {save_to}")
@@ -133,25 +130,28 @@ def create_pca_table():

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,b.ordinal
                ,p.bias
                ,p.ordinal
            FROM ids
            JOIN top.stories s
            JOIN stories s
                ON ids.story_id = s.id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()
        pub = db.query("""
            SELECT
                *
            FROM top.publishers
            FROM mbfc.publishers
        """).df()

    x = embeddings[data['index']]
@@ -161,8 +161,7 @@ def create_pca_table():
    data['first'] = pred[:, 0]
    data['second'] = pred[:, 1]

    table_name = f"top.story_embeddings_pca"

    table_name = f"story_embeddings_pca"
    with connect() as db:
        db.query(f"""
            CREATE OR REPLACE TABLE {table_name} AS
@@ -172,11 +171,12 @@ def create_pca_table():
                ,data.second as second
            FROM data
        """)

    print(f"created {table_name}")

@click.command('sentence:create-svm-table')
def create_svm_table():
    """sentence to classifier"""

    from sklearn import svm
    from sklearn.linear_model import SGDClassifier
@@ -189,22 +189,99 @@ def create_svm_table():
            SELECT
                ids.index
                ,s.id
                ,b.ordinal
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN top.stories s
            JOIN stories s
                ON ids.story_id = s.id
            JOIN top.publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    #y = data['ordinal'].to_numpy().reshape(-1, 1)
    y = data['ordinal']

    model = SGDClassifier()
    pred = model.fit(x, y)
    data['pred'] = pred.predict(x)
    data
    model = model.fit(x, y)
    # data['pred'] = pred.predict(x)
    save_model(model, 'sgdclassifier.pkl')

def inference():

    with connect() as db:
        bias = db.query("""
            SELECT
                p.bias
                ,p.ordinal
            FROM mbfc.publishers p
            WHERE p.ordinal != -1
            GROUP BY
                p.bias
                ,p.ordinal
            ORDER BY
                p.ordinal
        """).df()

    sdg = load_model('sgdclassifier.pkl')

    tokens = tokenizer(["hello, i hate woke culture.", "trump is winning"], padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        output = model(**tokens)

    output = mean_pooling(output, tokens['attention_mask'])

    output = F.normalize(output, p=2, dim=1)
    sdg.predict(output)

    tokens
    dir(output)

def validation():

    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC
    from sklearn.metrics import ConfusionMatrixDisplay
    import matplotlib.pyplot as plt

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = np.load(paths('data') / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    with connect() as db:
        data = db.query("""
            SELECT
                ids.index
                ,s.id
                ,p.ordinal
                ,p.bias
            FROM ids
            JOIN stories s
                ON ids.story_id = s.id
            JOIN mbfc.publisher_stories ps
                ON s.id = ps.story_id
            JOIN mbfc.publishers p
                ON p.id = ps.publisher_id
            WHERE p.ordinal != -1
        """).df()

    x = embeddings[data['index']]
    y = data['ordinal']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    clf = LinearSVC()
    clf.fit(x_train, y_train)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, clf.predict(x_test), ax=ax)
    ax.set(title="confusion matrix for LinearSVC classifier on test data.", xticklabels=ticklabels(), yticklabels=ticklabels())
    plt.show()

    plt.savefig(save_to)
@@ -20,15 +20,14 @@ def extract(chunks):

    # load data
    DB = connect()
    table = DB.sql("""
        select
            id
            ,title
        from stories
        order by id desc
    """).df()
    DB.close()
    with connect() as db:
        table = db.sql("""
            select
                id
                ,title
            from stories
            order by id desc
        """).df()

    # normalize text
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -56,12 +55,12 @@ def extract(chunks):
    story_ids = np.concatenate(story_ids)

    # save embeddings
    save_to = data_dir() / 'sentiment.npy'
    save_to = paths('data') / 'sentiment.npy'
    np.save(save_to, sentiments)
    print(f"sentiments saved: {save_to}")

    # save ids
    save_to = data_dir() / 'sentiment_ids.npy'
    save_to = paths('data') / 'sentiment_ids.npy'
    np.save(save_to, story_ids)
    print(f"ids saved: {save_to}")
@@ -1,7 +1,7 @@
import click
from transformers import AutoTokenizer, RobertaModel
import numpy as np
from data.main import Data, from_db, connect, data_dir
from data.main import connect, paths
from tqdm import tqdm
import torch
from pathlib import Path
@@ -9,30 +9,23 @@ from pathlib import Path
@click.command(name="word:max-sequence")
def max_sequence():
    """calculate the maximum token length given the story titles"""
    db = connect()
    longest = db.sql("""
        select
            title
        from stories
        order by length(title) desc
        limit 5000
    """).df()
    db.close()
    with connect() as db:
        longest = db.sql("""
            select
                title
            from stories
            order by length(title) desc
            limit 5000
        """).df()

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    tokens = tokenizer(longest['title'].to_list())
    print(f"{max([len(x) for x in tokens['input_ids']])}")

@click.command(name="word:train")
def train():
    """TODO"""
    table = from_db(Data.Titles)
    n_classes = 10

@click.command(name="word:embed")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
@click.option('--embedding_dir', help="path to save embeddings as np array", type=Path, default=Path(data_dir() / 'embeddings'), show_default=True)
@click.option('--token_dir', help="path to save tokens as np array", type=Path, default=Path(data_dir() / 'tokens'), show_default=True)
@click.option('--embedding_dir', help="path to save embeddings as np array", type=Path, default=Path(paths('data') / 'embeddings'), show_default=True)
@click.option('--token_dir', help="path to save tokens as np array", type=Path, default=Path(paths('data') / 'tokens'), show_default=True)
@click.option('--device', help="device to process data on", type=str, default="cuda:0", show_default=True)
def embed(chunks, embedding_dir, token_dir, device):
    """ given titles, generate tokens and word embeddings and saves to disk """
@@ -44,14 +37,13 @@ def embed(chunks, embedding_dir, token_dir, device):
    model.to(device)

    # load data
    db = connect()
    table = db.sql("""
        select
            title
        from stories
        order by id desc
    """).df()
    db.close()
    with connect() as db:
        table = db.sql("""
            select
                title
            from stories
            order by id desc
        """).df()

    # normalize text
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
@@ -82,7 +74,7 @@ def distance():
    closest = np.unravel_index(min_index, distances.shape)
    distances.flatten().shape

    # path = data_dir() / 'embeddings'
    # path = paths('data') / 'embeddings'
    # chunks = [x for x in path.iterdir() if x.match('*.npy')]
    # chunks = sorted(chunks, key=lambda x: int(x.stem.split('_')[1]))
    #
@@ -98,4 +90,4 @@ def distance():
    #
    # data.shape
    #
    # np.save(data, data_dir() / 'embeddings.npy')
    # np.save(data, paths('data') / 'embeddings.npy')
src/mining/__init__.py (new file, 9 lines)
@@ -0,0 +1,9 @@
import mining.main
import mining.apriori
import mining.bias

__all__ = [
    'main'
    ,'apriori'
    ,'bias'
]
@@ -1,3 +1,5 @@
import click

from efficient_apriori import apriori
from data.main import connect
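For context on the import above: efficient_apriori.apriori takes an iterable of transactions and returns frequent itemsets plus association rules. A minimal sketch with made-up publisher "baskets":

    from efficient_apriori import apriori

    transactions = [
        ('cnn.com', 'nytimes.com'),
        ('cnn.com', 'foxnews.com'),
        ('cnn.com', 'nytimes.com'),
    ]
    itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=0.8)
    print(itemsets)  # frequent itemsets keyed by size
    print(rules)     # rules meeting the support/confidence thresholds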
src/model.py (46 changed lines)
@@ -15,49 +15,3 @@ class Model(nn.Module):
        outs = self.act(self.linear(outs.last_hidden_state))
        return outs

import torch.nn as nn
from transformers import BertPreTrainedModel, BertModel


class BertForMultiLabelClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
        self.loss_fct = nn.BCEWithLogitsLoss()

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            loss = self.loss_fct(logits, labels)
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)
@@ -1,5 +0,0 @@
import pandas as pd
import math

df = pd.read_csv('/tmp/attr.csv')
((((df.left - 9.1) ** 2) + ((df.right - 11.0) ** 2)) ** 0.5).sort_values()
@@ -3,6 +3,7 @@ import plots.emotion
import plots.sentiment
import plots.links
import plots.classifier
import plots.descriptive

__all__ = [
    'sentence'
@@ -10,4 +11,5 @@ __all__ = [
    'sentiment',
    'links',
    'classifier',
    'descriptive',
]
@@ -5,7 +5,7 @@ import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

@click.command('plot:pca-with-classes')
@click.command('classifier:pca-with-classes')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def pca_with_classes(source):
@@ -6,7 +6,7 @@ import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np

@click.command('plot:articles-per-year')
@click.command('descriptive:articles-per-year')
def articles_per_year():
    save_to = paths('figures') / 'articles_per_year.png'
@@ -27,29 +27,34 @@ def articles_per_year():
    plt.savefig(save_to)
    print(f"saved: {save_to}")

@click.command('plot:distinct-publishers')
@click.command('descriptive:distinct-publishers')
def distinct_publishers():
    save_to = paths('figures') / 'distinct_publishers.png'

    with connect() as db:
        data = DB.query("""
        data = db.query("""
            select
                year(published_at) as year
                ,count(distinct publisher_id) as publishers
            from stories
            group by
                year(published_at)
                count(distinct p.id) as publishers
                ,date_trunc('year', s.published_at) as date
            from stories s
            join mbfc.publisher_stories ps
                on s.id = ps.story_id
            join mbfc.publishers p
                on ps.publisher_id = p.id
                and year(s.published_at) not in (2005, 2023)
            group by
                date_trunc('year', s.published_at)
        """).df()

    ax = sns.barplot(x=data.year, y=data.publishers, color='tab:blue')
    ax.tick_params(axis='x', rotation=90)
    ax.set(title="count of publishers per year", ylabel="count of publishers (#)")
    ax = sns.barplot(x=data.date.dt.year, y=data.publishers, color='tab:blue')
    ax.tick_params(axis='x', rotation=45)
    ax.set(ylabel="count of publishers (#)", xlabel="year")
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

@click.command('plot:stories-per-publisher')
@click.command('descriptive:stories-per-publisher')
def stories_per_publisher():
    save_to = paths('figures') / 'stories_per_publisher.png'
@@ -100,7 +105,7 @@ def stories_per_publisher():
    print(f"saved: {save_to}")


@click.command('plot:top-publishers')
@click.command('descriptive:top-publishers')
def top_publishers():
    """plot top publishers over time"""
@@ -164,7 +169,7 @@ def top_publishers():
    print(f"saved: {save_to}")


@click.command('plot:common_tld')
@click.command('descriptive:common_tld')
def common_tld():
    import dataframe_image as dfi
    save_to = paths('figures') / 'common_tld.png'
@@ -189,42 +194,71 @@ def common_tld():
def stats():

    # raw
    DB.query("""
        SELECT
            'total stories' as key
            ,COUNT(1) as value
        FROM stories
        UNION
        SELECT
            'total related' as key
            ,COUNT(1) as value
        FROM related_stories
        UNION
        SELECT
            'top level domains' as key
            ,COUNT(distinct tld) as value
        FROM stories
        UNION
        SELECT
            'publishers' as key
            ,COUNT(1) as value
        FROM publishers
        UNION
        SELECT
            'authors' as key
            ,COUNT(distinct author) as value
        FROM stories
        UNION
        SELECT
            'min year' as key
            ,min(year(published_at)) as value
        FROM stories
        UNION
        SELECT
            'max year' as key
            ,max(year(published_at)) as value
        FROM stories
    """).df().to_markdown(index=False)
    with connect() as db:
        db.query("""
            SELECT
                'total stories' as key
                ,COUNT(1) as value
            FROM stories
            UNION
            SELECT
                'total related' as key
                ,COUNT(1) as value
            FROM related_stories
            UNION
            SELECT
                'top level domains' as key
                ,COUNT(distinct tld) as value
            FROM stories
            UNION
            SELECT
                'publishers' as key
                ,COUNT(1) as value
            FROM mbfc.publishers
            UNION
            SELECT
                'authors' as key
                ,COUNT(distinct author) as value
            FROM stories
            UNION
            SELECT
                'years' as key
                ,min(year(published_at)) || '-' || max(year(published_at)) as value
            FROM stories
            UNION
            SELECT
                'max year' as key
                ,max(year(published_at)) as value
            FROM stories
            UNION
            SELECT
                'publishers with ratings' as key
                ,count(distinct ps.publisher_id)
            FROM mbfc.publisher_stories ps
            UNION
            SELECT
                'publishers without ratings' as key
                ,count(distinct s.publisher_id)
            from stories s
            left join mbfc.publisher_stories ps
                on ps.story_id = s.id
            where ps.publisher_id is null
            UNION
            SELECT
                'stories with ratings' as key
                ,count(distinct ps.story_id)
            FROM mbfc.publisher_stories ps
            UNION
            SELECT
                'stories without ratings' as key
                ,count(distinct s.id)
            from stories s
            left join mbfc.publisher_stories ps
                on ps.story_id = s.id
            where ps.publisher_id is null
        """)

    #.df().to_markdown(index=False)

    # selected
    DB.query("""
@@ -264,7 +298,7 @@ def stats():
        FROM top.stories
    """).df().to_markdown(index=False)

@click.command('plot:bias-stats')
@click.command('descriptive:bias-stats')
def bias_stats():
    import dataframe_image as dfi
    save_to = paths('figures') / 'bias_stats.png'
@@ -322,7 +356,7 @@ def bias_stats():
    DB.close()
    print(df.to_markdown(index=False))

@click.command('plot:bias-over-time')
@click.command('descriptive:bias-over-time')
def bias_over_time():
    """plot bias labels over time"""
@@ -6,7 +6,7 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

@click.command('plot:emotion-over-time')
@click.command('emotion:over-time')
def emotion_over_time():

    filename = "emotion_over_time.png"
@@ -35,7 +35,7 @@ def emotion_over_time():
    print(f"saved: {save_to}")
    os.system(f'xdg-open {save_to}')

@click.command('plot:emotion-regression')
@click.command('emotion:regression')
def emotion_regression():
    """plot emotion over time as regression"""
@@ -114,7 +114,7 @@ def emotion_regression():
    plt.close()
    print(f"saved: {save_to}")

@click.command('plot:emotion-hist')
@click.command('emotion:hist')
def emotion_hist():

    filename = "emotion_hist.png"
@@ -1,16 +1,13 @@
import click
from data.main import connect
from links import to_matrix
import os
from data.main import connect, ticklabels, paths
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
from sklearn.metrics import silhouette_score
import pandas as pd


@click.command('plot:link-elbow')
@click.command('links:elbow')
def elbow():
    from sklearn.cluster import KMeans
@@ -42,7 +39,7 @@ def elbow():

    # randomly pick 8

@click.command('plot:link-pca-clusters')
@click.command('links:pca-clusters')
@click.option('--source', type=click.Choice(['links', 'normalized', 'onehot']), default='links')
def link_pca_clusters(source):
@@ -57,20 +54,22 @@ def link_pca_clusters(source):
            ,pca.first
            ,pca.second
            ,s.cnt as stories
        FROM top.publisher_clusters_{source} c
        JOIN top.publishers p
            ON c.publisher_id = p.id
        FROM publisher_clusters_{source} c
        JOIN mbfc.publisher_stories ps
            ON ps.publisher_id = c.publisher_id
        JOIN mbfc.publishers p
            ON ps.publisher_id = p.id
        JOIN
        (
            select
                s.publisher_id
                p.id as publisher_id
                ,count(1) as cnt
            FROM top.stories s
            FROM mbfc.publishers p
            GROUP BY
                s.publisher_id
                p.id
        ) s
            ON s.publisher_id = p.id
        JOIN top.publisher_pca_{source} pca
        JOIN publisher_pca_{source} pca
            ON pca.publisher_id = p.id
    """).df()
@@ -107,7 +106,7 @@ def test():
    """)


@click.command('plot:link-confusion')
@click.command('links:confusion')
def link_confusion():
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
@@ -166,7 +165,7 @@ def link_confusion():
    plt.close()
    print(f"saved plot: {save_to}")

@click.command('plot:link-classifier')
@click.command('links:classifier')
def link_confusion():
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
@@ -178,18 +177,16 @@ def link_confusion():
        bias = db.query("""
            SELECT
                p.id as publisher_id
                ,b.ordinal
            FROM top.publishers p
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
                ,p.ordinal
            FROM mbfc.publishers p
            where ordinal != -1
        """).df()

    with connect() as db:
        df = db.query("""
            SELECT
                *
            FROM top.link_edges
            FROM link_edges
            WHERE parent_id in (
                select
                    publisher_id
@@ -203,36 +200,22 @@ def link_confusion():
        """).df()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    x = pivot.values
    y = bias.sort_values('publisher_id').ordinal

    with connect() as db:
        data = db.query(f"""
            SELECT
                p.id as publisher_id
                ,pca.first
                ,pca.second
            FROM top.publisher_pca_onehot pca
            JOIN top.publishers p
                ON pca.publisher_id = p.id
        """).df()


    publisher_matrix = pd.merge(pivot, bias, left_on='parent_id', right_on='publisher_id')
    x = publisher_matrix.loc[:, ~publisher_matrix.columns.isin(['publisher_id', 'ordinal'])].values
    y = publisher_matrix['ordinal']

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x, y)
    y_pred = model.predict(x)

    plot = bias.sort_values('publisher_id')
    plot['pred'] = y_pred
    data = pd.merge(plot, data)
    publisher_matrix['pred'] = y_pred
    publisher_matrix


    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
    fig, ax = plt.subplots(figsize=(5, 5))
    ConfusionMatrixDisplay.from_predictions(publisher_matrix['ordinal'], publisher_matrix['pred'], ax=ax)
    ax.set(xticklabels=ticklabels(), yticklabels=ticklabels())
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved plot: {save_to}")
@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

@click.command('plot:sentence-pca')
@click.command('sentence:pca')
def sentence_pca():
    save_to = paths('figures') / "embedding_sentence_pca.png"
@@ -30,7 +30,7 @@ def sentence_pca():
    ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
    plt.savefig(save_to)

@click.command('plot:avg-sentence-pca')
@click.command('sentence:avg-pca')
def avg_sentence_pca():
    save_to = paths('figures') / "avg_embedding_sentence_pca.png"
@@ -54,7 +54,7 @@ def avg_sentence_pca():
    ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
    plt.savefig(save_to)

@click.command('plot:sentence-confusion')
@click.command('sentence:confusion')
def sentence_confusion():
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
@@ -3,7 +3,7 @@ from data.main import connect, paths, ticklabels
import seaborn as sns
import matplotlib.pyplot as plt

@click.command('plot:sentiment-over-time')
@click.command('sentiment:over-time')
def over_time():

    filename = "sentiment_over_time.png"
@@ -28,7 +28,74 @@ def over_time():
    plt.close()
    print(f"saved: {save_to}")

@click.command('plot:bias-vs-sentiment-over-time')
@click.command('sentiment:bias-over-time')
def bias_over_time():
    """plot sentiment/bias vs. time"""

    filename = "publisher_avg_sentiment_vs_bias_over_time.png"
    save_to = paths('figures') / filename

    with connect() as db:
        data = db.sql("""
            with cte as (
                SELECT
                    avg(sent.class_id) as sentiment
                    ,date_trunc('yearweek', s.published_at) as date
                    ,p.id
                    ,p.bias
                FROM story_sentiments sent
                JOIN stories s
                    ON s.id = sent.story_id
                JOIN mbfc.publisher_stories ps
                    ON ps.story_id = s.id
                JOIN mbfc.publishers p
                    ON p.id = ps.publisher_id
                WHERE p.ordinal != -1
                and year(date) not in (2005, 2023)
                GROUP BY
                    date_trunc('yearweek', s.published_at)
                    ,p.id
                    ,p.bias
            ) ,b as (
                select
                    avg(sentiment) as sentiment
                    ,median(sentiment) as median_sentiment
                    ,bias
                    ,date
                from cte
                group by
                    bias
                    ,date
            )
            select
                median(sentiment) OVER (PARTITION BY bias ORDER BY date DESC ROWS BETWEEN 0 PRECEDING AND 7 FOLLOWING) as sentiment
                ,bias
                ,date
            from b
        """).df()

    ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
    plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
    ax.set(ylabel='8 week rolling avg. sentiment', xlabel='date', ylim=[0,1])
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

    # from scipy.stats import pearsonr
    # pivot = data.pivot(index=['date'], columns=['bias'], values='sentiment')
    #
    #
    # for left in pivot.keys():
    #     for right in pivot.keys():
    #         if left != right:
    #             result = pearsonr(pivot[left], pivot[right])
    #             print(f"{left:<15}/{right:<15} | p: {result.pvalue:.2e} | coef: {result.statistic:.3f}")
    #
    # pivot


@click.command('sentiment:bias-over-time')
def bias_over_time():
    """plot sentiment/bias vs. time"""
@@ -62,16 +129,15 @@ def bias_over_time():
        WHERE year(date) not in (2005, 2023)
    """).df()

    #ax = sns.relplot(data, x='date', y='sentiment', col='bias', palette='rainbow', hue='bias', col_order=ticklabels())
    ax = sns.lineplot(data, x='date', y='sentiment', palette='rainbow', hue='bias', hue_order=ticklabels())
    plt.axhline(y=0.5, color='black', linestyle='--', label='neutral')
    ax.set(title='sentiment and bias vs. time', ylabel='8 week rolling avg. sentiment', xlabel='date')
    ax.set(ylabel='8 week rolling avg. sentiment', xlabel='date', ylim=[0,1])
    plt.tight_layout()
    plt.savefig(save_to)
    plt.close()
    print(f"saved: {save_to}")

@click.command('plot:sentiment-recent-winner')
@click.command('sentiment:recent-winner')
def bias_vs_recent_winner():
    """plot bias vs. distance to election"""
@@ -106,7 +172,7 @@ def bias_vs_recent_winner():
    plt.close()
    print(f"saved: {save_to}")

@click.command('plot:sentiment-hist')
@click.command('sentiment:hist')
def sentiment_hist():

    filename = "sentiment_hist.png"
@@ -1,5 +1,7 @@
import train.main
import train.model

__all__ = [
    'main'
    ,'model'
]
@@ -1,38 +1,104 @@
from torch.utils.data import Dataset
from data.main import connect, data_dir
from bias import label_to_int
from data.main import connect, paths
import numpy as np
import pandas as pd
import os

class NewsDataset(Dataset):
    def __init__(self):
        self.embeddings = np.load(data_dir() / 'embeddings.npy')
        embedding_ids = pd.DataFrame(np.load(data_dir() / 'embedding_ids.npy'), columns=['id']).reset_index()
        self.embeddings = np.load(paths('data') / 'embeddings.npy')
        self.embedding_ids = pd.DataFrame(np.load(paths('data') / 'embedding_ids.npy'), columns=['id']).reset_index()

        DB = connect()
        query = """
            SELECT
                s.id
                ,b.label
                ,count(1) over (partition by publisher_id) as stories
            FROM stories s
            JOIN publisher_bias b
                ON b.id = s.publisher_id
            WHERE b.label != 'allsides'
        """
        data = DB.sql(query).df()
        DB.close()

        data['label'] = data['label'].apply(lambda x: label_to_int(x))
        data = data.merge(embedding_ids)
        self.data = data
        with connect() as db:
            self.data = db.sql("""
                WITH cte AS (
                    SELECT
                        s.id
                        ,p.ordinal
                        ,date_part('epoch', s.published_at) as epoch
                        ,count(1) over(partition by p.id) as publisher_stories
                        ,row_number() over(partition by p.ordinal) as label_row
                    FROM stories s
                    JOIN mbfc.publisher_stories ps
                        ON ps.story_id = s.id
                    JOIN mbfc.publishers p
                        ON ps.publisher_id = p.id
                    WHERE p.ordinal != -1
                )
                SELECT
                    id
                    ,epoch
                    ,publisher_stories
                    ,ordinal
                FROM cte
                WHERE label_row < 40000
            """).df()
        self.data = self.data.merge(self.embedding_ids)
        self.data['epoch_norm'] = (self.data['epoch'] - self.data['epoch'].min())/(self.data['epoch'].max()-self.data['epoch'].min())
        self.data['publisher_stories_norm'] = (self.data['publisher_stories'] - self.data['publisher_stories'].min())/(self.data['publisher_stories'].max()-self.data['publisher_stories'].min())

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        y = row['label']
        # x = np.concatenate((self.embeddings[row['index']], [row['stories']])).astype(np.float32)
        x = self.embeddings[row['index']]
        y = int(row['ordinal'])
        x = self.embeddings[int(row['index'])]
        x = np.append(x, row[['epoch_norm', 'publisher_stories_norm']].values).astype(np.float32)
        return x, y

    def normalized_epoch(self, idx):
        epoch = self.data['epoch']
        return (epoch.iloc[idx]-epoch.min())/(epoch.max()-epoch.min())

    def normalized_stories(self, idx):
        count = self.data['publisher_stories']
        return (count.iloc[idx]-count.min())/(count.max()-count.min())

    def get_in_out_size(self):
        return int(os.getenv('EMBEDDING_LENGTH', 384)), int(os.getenv('CLASSES', 5)),

class PublisherDataset(Dataset):
    def __init__(self):
        embeddings = np.load(paths('data') / 'embeddings.npy')
        embedding_ids = pd.DataFrame(np.load(paths('data') / 'embedding_ids.npy'), columns=['id']).reset_index()

        with connect() as db:
            data = db.sql("""
                WITH cte AS (
                    SELECT
                        s.id
                        ,p.id as publisher_id
                        ,p.ordinal
                        ,row_number() over(partition by p.ordinal) as label_row
                    FROM stories s
                    JOIN mbfc.publisher_stories ps
                        ON ps.story_id = s.id
                    JOIN mbfc.publishers p
                        ON ps.publisher_id = p.id
                    WHERE p.ordinal != -1
                )
                SELECT
                    id
                    ,ordinal
                    ,publisher_id
                FROM cte
                WHERE label_row < 40000
            """).df()

        data = data.merge(embedding_ids)
        self.x = []
        self.y = []
        for (publisher_id, ordinal), group in data.groupby(['publisher_id', 'ordinal'])[['ordinal', 'index']]:
            self.x.append(embeddings[group['index']].mean(axis=0))
            self.y.append(ordinal)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def get_in_out_size(self):
        return int(os.getenv('EMBEDDING_LENGTH', 384)), int(os.getenv('CLASSES', 5)),
@@ -5,34 +5,32 @@ from dotenv import load_dotenv
import os

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from accelerate import Accelerator

from train.dataset import NewsDataset
from train.model import Classifier
#from model.linear import LinearClassifier
from data.main import paths, connect, ticklabels
import numpy as np
import pandas as pd

class Stage(Enum):
    TRAIN = auto()
    DEV = auto()

@click.command('train:main')
def main():
    dev_after = 20
@click.command('main')
@click.option('--epochs', default=10, type=int)
def main(epochs):
    dev_after = 5
    visible_devices = None
    lr = 1e-4
    epochs = 10
    debug = False
    torch.manual_seed(0)
    num_workers = 0

    num_workers = int(os.getenv('NUMBER_OF_WORKERS', 0))
    embedding_length = int(os.getenv('EMBEDDING_LENGTH', 384))

    dataset = NewsDataset()
    trainset, devset = torch.utils.data.random_split(dataset, [0.8, 0.2])
    batch_size = 512
    batch_size = int(os.getenv('BATCH_SIZE', 512))
    trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True)
    devloader = DataLoader(devset, shuffle=False, num_workers=num_workers)
    accelerator = Accelerator()
@@ -46,7 +44,7 @@ def main():
    #accelerator.log({"message" :"debug enabled"})

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # wrap objects with accelerate
    model, optimizer, trainloader, devloader = accelerator.prepare(model, optimizer, trainloader, devloader)
@@ -76,57 +74,45 @@ def main():


    for epoch in range(epochs):
        if (epoch - 1) % dev_after == 0:
        if (epoch + 1) % dev_after == 0:
            stage = Stage.DEV
            log = run()
            print(f"dev loss: {log}")
        else:
            stage = Stage.TRAIN
            log = run()
            print(f"train loss: {log}")
            print(f"dev loss: {log:.3f}")
        stage = Stage.TRAIN
        log = run()
        print(f"train loss: {log:.3f}")
    torch.save(model.state_dict(), paths('model') / 'torch_clf.pth')

@click.command('validate')
def validate():
    from sklearn.metrics import ConfusionMatrixDisplay
    import matplotlib.pyplot as plt
    import seaborn as sns

    embeddings = np.load(paths('data') / 'embeddings.npy')
    embedding_ids = pd.DataFrame(np.load(paths('data') / 'embedding_ids.npy'), columns=['id']).reset_index()

    embedding_length = int(os.getenv('EMBEDDING_LENGTH', 384))
    model = Classifier(embedding_length=embedding_length, classes=5)
    model.load_state_dict(torch.load(paths('model') / 'torch_clf.pth'))
    model.eval()

    dataset = NewsDataset()

    y = dataset[:][1]
    with torch.no_grad():
        out = model(torch.tensor(dataset[:][0]))

    sns.histplot(pd.DataFrame(out).melt(), x='value', hue='variable', palette='rainbow')
    out_path = (paths('data') / 'runs')
    out_path.mkdir(exist_ok=True)
    plt.savefig(out_path / 'label_hist.png')
    plt.close()

    y_pred = out.argmax(axis=1)
    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y, y_pred, ax=ax)
    ax.set(title="confusion matrix for the classifier on test data.", xticklabels=ticklabels(), yticklabels=ticklabels())
    plt.savefig(out_path / 'confusion_matrix.png')
    plt.close()
    breakpoint()
    from data.main import data_dir, connect
    import numpy as np
    import pandas as pd
    from bias import int_to_label

    embeddings = dataset.embeddings
    embedding_ids = dataset.data

    DB = connect()
    query = """
        SELECT
            s.id
            ,title
            ,p.name
            ,count(1) over (partition by publisher_id) as stories
        FROM stories s
        JOIN publishers p
            on p.id = s.publisher_id
        WHERE s.publisher_id NOT IN (
            SELECT
                id
            FROM publisher_bias b
        )
    """
    data = DB.sql(query).df()
    embeddings = np.load(data_dir() / 'embeddings.npy')
    embedding_ids = pd.DataFrame(np.load(data_dir() / 'embedding_ids.npy'), columns=['id']).reset_index()


    for i in range(10):
        embedding = embeddings[embedding_ids[embedding_ids['id'] == data.iloc[i]['id']]['index']]
        title = data.iloc[i]['title']
        publisher = data.iloc[i]['name']
        class_pred = nn.functional.softmax(model(torch.tensor(embedding))).detach()
        class_id = int(torch.argmax(nn.functional.softmax(model(torch.tensor(embedding))).detach()))
        print(f"{publisher}: {int_to_label(class_id)} - \"{title}\"")

    embedding_ids['id'] == data.iloc[0]['id']
    embedding_ids[embedding_ids['id'] == data.iloc[0]['id']]
    embedding = embeddings[embedding_ids[embedding_ids['id'] == data.iloc[0]['id']]['index']]
    title
    publisher

    model().get_last_layer(torch.tensor(embedding))
@@ -1,4 +1,5 @@
from torch import nn
from transformers import BertPreTrainedModel, BertModel

class Classifier(nn.Module):
    def __init__(self, embedding_length: int, classes: int):
@@ -26,3 +27,47 @@ class Classifier(nn.Module):
    def get_last_layer(self, x):
        x = self.stack(x)
        return x


class BertForMultiLabelClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
        self.loss_fct = nn.BCEWithLogitsLoss()

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            loss = self.loss_fct(logits, labels)
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)