Add experiment 4 of 5 (sentiment/emotion plots, election-distance tables, and avg-embedding PCA)

This commit is contained in:
matt
2023-05-17 21:38:21 -07:00
parent 74c2d8afa2
commit 3f7b3ad467
16 changed files with 905 additions and 59 deletions

View File

@@ -12,6 +12,8 @@ if __name__ == "__main__":
cli.add_command(scrape.parse)
cli.add_command(scrape.load)
cli.add_command(scrape.normalize)
cli.add_command(scrape.create_elections_table)
import word
# cli.add_command(word.distance)
# cli.add_command(word.train)
@@ -30,8 +32,11 @@ if __name__ == "__main__":
cli.add_command(emotion.normalize)
cli.add_command(emotion.analyze)
cli.add_command(emotion.create_table)
import sentence
cli.add_command(sentence.embed)
cli.add_command(sentence.create_avg_pca_table)
from train import main as train_main
cli.add_command(train_main.main)
@@ -54,4 +59,14 @@ if __name__ == "__main__":
import plots.classifier as plotc
cli.add_command(plotc.pca_with_classes)
import plots
cli.add_command(plots.sentence.sentence_pca)
cli.add_command(plots.sentence.avg_sentence_pca)
cli.add_command(plots.emotion.emotion_over_time)
cli.add_command(plots.emotion.emotion_regression)
cli.add_command(plots.sentiment.over_time)
cli.add_command(plots.sentiment.bias_over_time)
cli()

View File

@@ -335,3 +335,92 @@ def another_norm():
on sv2.id = s.id
limit 5
""")
@click.command('data:create-election-table')
def create_elections_table():
    """Load election dates from CSV and build two DuckDB tables.

    Creates `election_dates` (one row per election with a synthetic id)
    and `election_distance` (for every distinct story publish date, the
    day-distance to the *nearest* election).
    """
    # Pipe-delimited file; assumes columns `type` and `date` exist.
    df = pd.read_csv(data_dir() / 'election_dates.csv', sep="|")
    df['date'] = pd.to_datetime(df.date)
    DB = connect()
    # DuckDB resolves the local DataFrame `df` by name in the FROM clause.
    DB.query("""
        CREATE OR REPLACE TABLE election_dates AS
        SELECT
            row_number() over() as id
            ,type
            ,date
        FROM df
    """)
    # Cross-join every distinct publish date against every election, then
    # keep only the closest election per publish date (rn = 1, ordered by
    # absolute day distance). `days_away` keeps its sign (future vs. past).
    DB.query("""
        CREATE OR REPLACE TABLE election_distance AS
        WITH cte as (
            SELECT
                day(e.date - s.published_at) as days_away
                ,e.id as election_id
                ,e.date as election_date
                ,s.published_at as publish_date
            FROM (
                SELECT
                    DISTINCT
                    published_at
                FROM top.stories
            ) s
            CROSS JOIN election_dates e
        ) , windowed as (
            SELECT
                row_number() over(partition by publish_date order by abs(days_away) asc) as rn
                ,days_away
                ,publish_date
                ,election_date
                ,election_id
            FROM cte
        )
        SELECT
            days_away
            ,publish_date
            ,election_date
            ,election_id
        FROM windowed
        WHERE rn = 1
    """)
    DB.close()
@click.command('scrape:create-denorm')
def create_denorm():
    """Build a denormalized `denorm.stories` table joining stories with
    publisher, sentiment, election distance, bias, PCA link components,
    and emotion data.

    NOTE(review): all joins are inner joins, so any story missing one of
    these relations is silently dropped from the denormalized table.
    NOTE(review): `create schema denorm` raises if the schema already
    exists — this command assumes a fresh database.
    """
    DB = connect()
    DB.sql("create schema denorm")
    DB.sql("""
        CREATE OR REPLACE TABLE denorm.stories AS
        SELECT
            s.id as story_id
            ,s.title
            ,s.url
            ,s.published_at
            ,s.author
            ,p.name as publisher
            ,p.tld as tld
            ,sent.class_id as sentiment
            ,d.days_away as election_distance
            ,b.ordinal as bias
            ,pca.first as link_1
            ,pca.second as link_2
            ,e.emotion_id as emotion
        FROM top.stories s
        JOIN top.publishers p
            ON p.id = s.publisher_id
        JOIN top.story_sentiments sent
            ON s.id = sent.story_id
        JOIN election_distance d
            ON d.election_date = s.published_at
        JOIN publisher_bias pb
            ON pb.publisher_id = p.id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
        JOIN top.publisher_pca_onehot pca
            ON pca.publisher_id = p.id
        JOIN story_emotions e
            ON e.story_id = s.id
    """)
    DB.close()

View File

@@ -379,24 +379,34 @@ def debug():
# NOTE(review): the body of `another()` below appears to be a mangled diff
# capture — old and new versions of the emotions-per-year query are
# interleaved without +/- markers (note the consecutive
# `emotions = DB.sql("""` / `DB.sql("""` openings), so this block does NOT
# parse as valid Python as captured here. It is preserved byte-for-byte;
# recover the intended version from version control before editing.
def another():
DB = connect()
DB.sql("""
select
*
from emotions
""")
emotions = DB.sql("""
DB.sql("""
select
year(s.published_at) as year
,se.label as emotion
,count(1) as stories
from stories s
join story_emotions se
on s.id = se.story_id
group by
year(s.published_at)
,se.label
*
from story_emotions
""")
emotions = DB.sql("""
SELECT
YEAR(s.published_at) AS year
,e.label AS emotion
,count(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
GROUP by
YEAR(s.published_at)
,e.label
""").df()
emotions
sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
plt.show()

View File

@@ -0,0 +1,9 @@
import plots.sentence
import plots.emotion
import plots.sentiment
__all__ = [
'sentence'
'emotion',
'sentiment',
]

117
src/plots/emotion.py Normal file
View File

@@ -0,0 +1,117 @@
import click
from data.main import connect
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:emotion-over-time')
def emotion_over_time():
    """Scatter yearly counts of story-title emotions and save as PNG."""
    filename = "emotion_over_time.png"

    DB = connect()
    emotions = DB.sql("""
        SELECT
            date_trunc('year', s.published_at) AS year
            ,e.label AS emotion
            ,count(1) AS stories
        FROM top.stories s
        JOIN story_emotions se
            ON s.id = se.story_id
        JOIN emotions e
            ON e.id = se.emotion_id
        GROUP by
            date_trunc('year', s.published_at)
            ,e.label
    """).df()
    DB.close()

    ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
    ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
    plt.savefig(out_path / filename)
    # Fix: the f-string had no placeholder and printed a literal "(unknown)";
    # report the actual saved path instead.
    print(f"saved: {out_path / filename}")
@click.command('plot:emotion-regression')
def emotion_regression():
    """Fit a linear trend of weekly story counts per (emotion, publisher),
    average the yearly slopes by bias rating, and save a heatmap.
    """
    from sklearn import linear_model
    # NOTE(review): train_test_split and ConfusionMatrixDisplay are
    # imported but never used in this function.
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import ConfusionMatrixDisplay
    filename = "emotion_regression.png"
    # NOTE(review): `emotions` (the label list) is fetched but unused below.
    DB = connect()
    emotions = DB.query("""
        SELECT
            label
        FROM emotions e
    """).df()['label'].to_list()
    DB.close()
    # Weekly story counts per publisher and emotion. epoch() yields seconds
    # since the Unix epoch, so regression slopes come out in stories/second.
    DB = connect()
    df = DB.sql(f"""
        SELECT
            epoch(date_trunc('yearweek', s.published_at)) AS date
            ,e.id AS emotion_id
            ,p.id as publisher_id
            ,count(1) AS stories
        FROM top.stories s
        JOIN top.publishers p
            ON p.id = s.publisher_id
        JOIN story_emotions se
            ON s.id = se.story_id
        JOIN emotions e
            ON e.id = se.emotion_id
        GROUP by
            epoch(date_trunc('yearweek', s.published_at))
            ,p.id
            ,e.id
    """).df()
    DB.close()
    results = []
    # One least-squares fit per (emotion, publisher) pair.
    for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
        model = linear_model.LinearRegression()
        x = group['date'].to_numpy().reshape(-1, 1)
        y = group['stories'].to_numpy()
        model.fit(x, y)
        # Convert the slope from stories/second to stories/year.
        per_year = model.coef_.item() * 60 * 60 * 24 * 365
        results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
    results = pd.DataFrame(results)
    # Average yearly slopes across publishers within each bias ordinal.
    DB = connect()
    out = DB.query("""
        SELECT
            e.label as emotion
            --,p.tld
            ,avg(results.per_year) as avg_reg_coef
            ,b.ordinal
        FROM results
        JOIN emotions e
            ON e.id = results.emotion_id
        JOIN top.publishers p
            ON p.id = results.publisher_id
        JOIN publisher_bias pb
            ON pb.publisher_id = results.publisher_id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
        GROUP BY
            e.label
            ,b.ordinal
    """).df()
    DB.close()
    # Rows = emotion, columns = bias ordinal (assumed 5 ordinals, matching
    # the hard-coded tick labels below — TODO confirm).
    pivot = out.pivot(index=['emotion'], columns=['ordinal'], values=['avg_reg_coef'])
    ax = sns.heatmap(pivot, cmap='RdBu_r')
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="slope of regression (stories/year) by bias and emotion"
        ,xticklabels=ticklabels
        ,xlabel="bias"
        ,ylabel="emotion")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # NOTE(review): f-string has no placeholder; prints a literal "(unknown)".
    print(f"saved: (unknown)")

111
src/plots/sentence.py Normal file
View File

@@ -0,0 +1,111 @@
import click
from data.main import connect
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
@click.command('plot:sentence-pca')
def sentence_pca():
    """Scatter the 2-D PCA components of story embeddings, colored by the
    publisher's bias label, and save the figure."""
    filename = "embedding_sentence_pca.png"

    query = """
        SELECT
            pca.first
            ,pca.second
            ,b.bias as label
        FROM top.story_embeddings_pca pca
        JOIN top.stories s
            ON s.id = pca.story_id
        JOIN top.publisher_bias pb
            ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """
    db = connect()
    frame = db.query(query).df()
    db.close()

    axis = sns.scatterplot(x=frame['first'], y=frame['second'], hue=frame['label'])
    axis.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
    plt.savefig(out_path / filename)
@click.command('plot:avg-sentence-pca')
def avg_sentence_pca():
    """Scatter per-publisher average-embedding PCA components, colored by
    the publisher's bias label, and save the figure."""
    filename = "avg_embedding_sentence_pca.png"
    DB = connect()
    # NOTE(review): `p.tld` is selected but not used in the plot below.
    data = DB.query("""
        SELECT
            pca.first
            ,pca.second
            ,p.tld
            ,b.bias as label
        FROM top.publisher_embeddings_pca pca
        JOIN top.publishers p
            ON p.id = pca.publisher_id
        JOIN top.publisher_bias pb
            ON pb.publisher_id = p.id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
    DB.close()
    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
    ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
    plt.savefig(out_path / filename)
@click.command('plot:sentence-confusion')
def sentence_confusion():
    """Train a kNN bias classifier on story-title embeddings and save the
    confusion matrix on the held-out test split.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    filename = "sentence_confusion.png"

    # Embeddings and their story ids were saved positionally aligned;
    # `index` maps each story id back to its row in `embeddings`.
    embeddings = np.load(data_path / 'embeddings.npy')
    embedding_ids = np.load(data_path / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    DB = connect()
    # (Removed an unused `SELECT * FROM top.publishers` query — its result
    # was never referenced.)
    data = DB.query("""
        SELECT
            ids.index
            ,s.id
            ,b.ordinal
        FROM ids
        JOIN top.stories s
            ON ids.story_id = s.id
        JOIN top.publisher_bias pb
            ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
    DB.close()

    train, test = train_test_split(data)
    train_x, train_y = embeddings[train['index']], train['ordinal']
    test_x, test_y = embeddings[test['index']], test['ordinal']

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(train_x, train_y)
    pred = model.predict(test_x)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
    # Tick labels assume the 5 bias ordinals appear in order — TODO confirm
    # all five classes are present in the test split.
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(out_path / filename)
    plt.close()
    # Fix: the f-string had no placeholder and printed a literal "(unknown)".
    print(f"saved plot: {out_path / filename}")

60
src/plots/sentiment.py Normal file
View File

@@ -0,0 +1,60 @@
import click
from data.main import connect
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:sentiment-over-time')
def over_time():
    """Scatter average story sentiment per publish date and save as PNG."""
    filename = "sentiment_over_time.png"

    DB = connect()
    # class_id is 0/1 (negative/positive), so the average is the positive
    # share of stories published that day.
    data = DB.sql("""
        SELECT
            avg(sent.class_id) as sentiment
            ,s.published_at as date
        FROM top.story_sentiments sent
        JOIN top.stories s
            ON s.id = sent.story_id
        GROUP BY
            s.published_at
    """).df()
    DB.close()

    ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
    ax.set(title="sentiment vs. time")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Fix: the f-string had no placeholder and printed a literal "(unknown)".
    print(f"saved: {out_path / filename}")
@click.command('plot:bias-vs-sentiment-over-time')
def bias_over_time():
    """Scatter average sentiment over time, faceted by bias rating, and
    save as PNG."""
    filename = "bias_vs_sentiment_over_time.png"

    DB = connect()
    data = DB.sql("""
        SELECT
            avg(sent.class_id) as sentiment
            ,s.published_at as date
            ,b.id as bias_id
        FROM top.story_sentiments sent
        JOIN top.stories s
            ON s.id = sent.story_id
        JOIN publisher_bias pb
            ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
        GROUP BY
            s.published_at
            ,b.id
    """).df()
    DB.close()

    # relplot returns a FacetGrid (one column of subplots per bias_id);
    # .set() applies the title to every facet.
    ax = sns.relplot(x=data['date'], y=data['sentiment'], col=data['bias_id'])
    ax.set(title="sentiment vs. time grouped by bias")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Fix: the f-string had no placeholder and printed a literal "(unknown)".
    print(f"saved: {out_path / filename}")

View File

@@ -72,16 +72,71 @@ def embed(chunks):
print(f"ids saved: {save_to}")
@click.command('sentence:create-pca-table')
def create_table():
from sklearn import linear_model
@click.command('sentence:create-avg-pca-table')
def create_avg_pca_table():
    """Average story embeddings per publisher, project the averages to 2-D
    with PCA, and store the components in top.publisher_embeddings_pca.
    """
    from sklearn.decomposition import PCA

    data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
    # Embeddings and their story ids were saved positionally aligned.
    embeddings = np.load(data_path / 'embeddings.npy')
    embedding_ids = np.load(data_path / 'embedding_ids.npy')
    ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()

    # Fix: the connection was opened twice back-to-back; once is enough.
    DB = connect()
    data = DB.query("""
        SELECT
            ids.index
            ,s.id
            ,s.publisher_id
            ,b.ordinal
        FROM ids
        JOIN top.stories s
            ON ids.story_id = s.id
        JOIN top.publisher_bias pb
            ON pb.publisher_id = s.publisher_id
        JOIN bias_ratings b
            ON b.id = pb.bias_id
    """).df()
    DB.close()

    results = []
    # Fix: group by the scalar key 'publisher_id' (not a one-element list)
    # so the loop variable is a plain value rather than a 1-tuple, which
    # would otherwise corrupt the publisher_id column in modern pandas.
    for publisher_id, group in data.groupby('publisher_id'):
        avg = embeddings[group['index']].mean(axis=0)
        # All rows in a group share one publisher, hence one ordinal.
        ordinal = group['ordinal'].iloc[0]
        results.append({'publisher_id' : publisher_id, 'embedding' : avg, 'ordinal' : ordinal})
    results = pd.DataFrame(results)

    x = np.stack(results['embedding'])
    y = results['ordinal']
    model = PCA(n_components=2)
    pred = model.fit_transform(x)
    results['first'] = pred[:, 0]
    results['second'] = pred[:, 1]

    table_name = "top.publisher_embeddings_pca"
    DB = connect()
    DB.query(f"""
        CREATE OR REPLACE TABLE {table_name} AS
        SELECT
            results.publisher_id as publisher_id
            ,results.first as first
            ,results.second as second
        FROM results
    """)
    DB.close()
    print(f"created {table_name}")
@click.command('sentence:create-pca-table')
def create_pca_table():
from sklearn.decomposition import PCA
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
embedding_ids = np.load(data_path / 'embedding_ids.npy')
DB = connect()
data = DB.query("""
SELECT
ids.index
@@ -95,19 +150,38 @@ def create_table():
JOIN bias_ratings b
ON b.id = pb.bias_id
""").df()
pub = DB.query("""
SELECT
*
FROM top.publishers
""").df()
DB.close()
x = embeddings[data['index']]
y = data['ordinal'].to_numpy().reshape(-1, 1)
model = PCA(n_components=2)
pred = model.fit_transform(x)
data['first'] = pred[:, 0]
data['second'] = pred[:, 1]
reg = linear_model.LinearRegression()
table_name = f"top.story_embeddings_pca"
reg.fit(x, y)
reg.coef_.shape
DB = connect()
DB.query(f"""
CREATE OR REPLACE TABLE {table_name} AS
SELECT
data.id as story_id
,data.first as first
,data.second as second
FROM data
""")
DB.close()
print(f"created {table_name}")
@click.command('sentence:create-svm-table')
def create_svm_table():
from sklearn import svm
from sklearn.linear_model import SGDClassifier
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
embeddings = np.load(data_path / 'embeddings.npy')
@@ -133,6 +207,8 @@ def create_svm_table():
#y = data['ordinal'].to_numpy().reshape(-1, 1)
y = data['ordinal']
clf = svm.SVC()
pred = clf.fit(x, y)
model = SGDClassifier()
pred = model.fit(x, y)
data['pred'] = pred.predict(x)
data

86
src/sentiment.py Normal file
View File

@@ -0,0 +1,86 @@
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F
from data import connect, data_dir
import numpy as np
from tqdm import tqdm
import click
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
@click.command("sentiment:extract")
def extract(chunks):
    """Classify the sentiment of every story title with DistilBERT and
    save the predicted class ids (and matching story ids) as .npy files.

    Args:
        chunks: number of batches to split the titles into.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Fix: removed a stray `chunks = 1000` that silently overrode the
    # --chunks CLI option.

    # Load model from HuggingFace Hub.
    # NOTE(review): this is the *base* distilbert checkpoint, not a
    # sentiment fine-tuned one — confirm the intended model.
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    model = model.to(device)

    # load data
    DB = connect()
    table = DB.sql("""
        select
            id
            ,title
        from stories
        order by id desc
    """).df()
    DB.close()

    # normalize text: strip non-ascii characters before tokenizing
    table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    chunked = np.array_split(table, chunks)

    # batched inference over title chunks
    iterator = tqdm(chunked, 'embedding')
    sentiments = []
    story_ids = []
    for chunk in iterator:  # (enumerate index was unused; dropped)
        sentences = chunk['title'].tolist()
        ids = chunk['id'].tolist()
        # Tokenize sentences
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        # Predicted class = argmax over the classifier logits
        with torch.no_grad():
            logits = model(**encoded_input.to(device)).logits
            sentiment = logits.argmax(axis=1).tolist()
        sentiments.append(sentiment)
        story_ids.append(ids)
    sentiments = np.concatenate(sentiments)
    story_ids = np.concatenate(story_ids)

    # save predictions
    save_to = data_dir() / 'sentiment.npy'
    np.save(save_to, sentiments)
    print(f"sentiments saved: {save_to}")

    # save ids
    save_to = data_dir() / 'sentiment_ids.npy'
    np.save(save_to, story_ids)
    print(f"ids saved: {save_to}")
@click.command('sentiment:load')
def load():
    """Load the saved sentiment predictions into top.story_sentiments,
    mapping class id 1 -> 'positive' and anything else -> 'negative'.
    """
    # Fix: this module never imports pandas at the top of the file, so
    # `pd.DataFrame` below raised NameError; import it locally.
    import pandas as pd

    sentiments = np.load(data_dir() / 'sentiment.npy')
    story_ids = np.load(data_dir() / 'sentiment_ids.npy')
    # Row order of ids matches the sentiment array (saved together).
    data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
    data['sentiment_id'] = sentiments

    DB = connect()
    # Inner join drops predictions for ids not present in top.stories.
    DB.query("""
        CREATE OR REPLACE TABLE top.story_sentiments AS
        SELECT
            data.story_id
            ,data.sentiment_id as class_id
            ,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
        FROM data
        JOIN top.stories s
            ON s.id = data.story_id
    """)
    DB.close()