finish presentation.

This commit is contained in:
matt 2023-05-18 19:55:15 -07:00
parent 3f7b3ad467
commit 398228f02c
18 changed files with 975 additions and 184 deletions

BIN
docs/figures/bias_hist.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 235 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 202 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 65 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 117 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 140 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

File diff suppressed because it is too large Load Diff

View File

@ -67,6 +67,7 @@ if __name__ == "__main__":
cli.add_command(plots.sentiment.over_time) cli.add_command(plots.sentiment.over_time)
cli.add_command(plots.sentiment.bias_over_time) cli.add_command(plots.sentiment.bias_over_time)
cli.add_command(plots.sentiment.bias_vs_recent_winner)
cli() cli()

View File

@ -348,6 +348,7 @@ def create_elections_table():
row_number() over() as id row_number() over() as id
,type ,type
,date ,date
,winner
FROM df FROM df
""") """)
@ -359,6 +360,7 @@ def create_elections_table():
,e.id as election_id ,e.id as election_id
,e.date as election_date ,e.date as election_date
,s.published_at as publish_date ,s.published_at as publish_date
,e.winner as winner
FROM ( FROM (
SELECT SELECT
DISTINCT DISTINCT
@ -373,6 +375,7 @@ def create_elections_table():
,publish_date ,publish_date
,election_date ,election_date
,election_id ,election_id
,winner
FROM cte FROM cte
) )
SELECT SELECT
@ -380,6 +383,7 @@ def create_elections_table():
,publish_date ,publish_date
,election_date ,election_date
,election_id ,election_id
,winner
FROM windowed FROM windowed
WHERE rn = 1 WHERE rn = 1
""") """)

60
src/plots/bias.py Normal file
View File

@ -0,0 +1,60 @@
import click
from data.main import connect
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Directory that every plot command in this module writes into:
# "$DATA_MINING_DOC_DIR/figures".
# NOTE: os.getenv returns None when the variable is unset, and Path(None)
# raises TypeError, so this module fails at import time without it.
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:bias-hist')
def hist():
    """Save a bar chart of the number of stories per bias rating.

    Counts rows of ``stories`` joined through ``publisher_bias`` to
    ``bias_ratings``, grouped by ``bias_ratings.ordinal``, and writes the
    chart to ``out_path / "bias_hist.png"``.
    """
    filename = "bias_hist.png"

    DB = connect()
    try:
        data = DB.sql("""
            SELECT
                b.ordinal
                ,count(1) as stories
            FROM stories s
            JOIN publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
            GROUP BY
                b.ordinal
        """).df()
    finally:
        # Release the connection even when the query raises.
        DB.close()

    ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
    # NOTE(review): assumes exactly five ordinals whose ascending order is
    # left -> right; confirm against the bias_ratings table.
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Close the figure so a later plot in the same process does not draw on
    # top of this one (publisher_hist already does this).
    plt.close()
    print(f"saved: {filename}")
@click.command('plot:bias-publisher-hist')
def publisher_hist():
    """Write ``bias_publisher_hist.png``: publishers counted per bias rating.

    Groups ``publisher_bias`` joined to ``bias_ratings`` by
    ``bias_ratings.ordinal`` and draws one bar per ordinal.
    """
    filename = "bias_publisher_hist.png"
    query = """
    SELECT
    b.ordinal
    ,count(1) as publishers
    FROM publisher_bias pb
    JOIN bias_ratings b
    ON b.id = pb.bias_id
    GROUP BY
    b.ordinal
    """

    conn = connect()
    counts = conn.sql(query).df()
    conn.close()

    ordinals = counts['ordinal']
    publishers = counts['publishers']
    axes = sns.barplot(x=ordinals, y=publishers, color='tab:blue')

    # NOTE(review): the five labels assume five ordinals in left -> right
    # order; confirm against the bias_ratings table.
    rating_names = ['left', 'left-center', 'center', 'right-center', 'right']
    axes.set(
        title="count of publishers per bias rating",
        xlabel="bias rating",
        xticklabels=rating_names,
    )

    plt.tight_layout()
    target = out_path / filename
    plt.savefig(target)
    plt.close()
    print(f"saved: {filename}")

View File

@ -115,3 +115,45 @@ def emotion_regression():
plt.tight_layout() plt.tight_layout()
plt.savefig(out_path / filename) plt.savefig(out_path / filename)
print(f"saved: {filename}") print(f"saved: {filename}")
@click.command('plot:emotion-hist')
def emotion_hist():
    """Save a bar chart of the number of distinct stories per emotion label.

    Queries ``story_emotions`` joined to ``emotions`` and ``top.stories``,
    prints the per-label summary (stories and publishers) as a markdown
    table, and writes the chart to ``out_path / "emotion_hist.png"``.

    The previous version discarded the emotion query and saved the
    bias-rating histogram under the emotion filename.
    """
    filename = "emotion_hist.png"

    DB = connect()
    try:
        data = DB.query("""
            select
                e.label
                ,count(distinct s.id) as stories
                ,count(distinct s.publisher_id) as publishers
            from story_emotions se
            join emotions e
                on e.id = se.emotion_id
            join top.stories s
                on s.id = se.story_id
            group by
                e.label
            order by
                stories desc
        """).df()
    finally:
        # Release the connection even when the query raises.
        DB.close()

    # The original built this table and threw it away; emit it so the
    # summary is usable in the write-up.
    print(data.to_markdown(index=False))

    # ORDER BY in the query fixes the bar order; GROUP BY alone does not
    # guarantee one.
    ax = sns.barplot(x=data['label'], y=data['stories'], color='tab:blue')
    ax.set(title="count of stories per emotion", xlabel="emotion")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Close the figure so later plots in the same process start clean.
    plt.close()
    print(f"saved: {filename}")

View File

@ -112,3 +112,134 @@ def test():
# .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False) # .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
DB.close() DB.close()
@click.command('plot:link-confusion')
def link_confusion():
    """Plot a confusion matrix for a kNN bias classifier on held-out data.

    Builds a parent x child link-count matrix from ``top.link_edges``
    (restricted to publishers that have a bias rating), fits a 5-nearest-
    neighbour classifier on a random train split to predict the bias
    ordinal of each parent publisher, and saves the confusion matrix for the
    test split to ``out_dir / "link_confusion.png"``.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    filename = "link_confusion.png"

    DB = connect()
    try:
        # Keep the local name `bias`: the second query selects "from bias",
        # presumably resolved through DuckDB's replacement scan of this
        # DataFrame - TODO confirm.
        bias = DB.query("""
            SELECT
                p.id as publisher_id
                ,b.ordinal
            FROM top.publishers p
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

        df = DB.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()
    finally:
        # The original never closed the connection.
        DB.close()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    x = pivot.values

    # Look labels up by the pivot's row index so row i of x always carries
    # the rating of publisher pivot.index[i], even when some rated
    # publishers never appear as a parent.
    y = bias.set_index('publisher_id').loc[pivot.index, 'ordinal']

    # Split features and labels in ONE call: separate calls shuffle each
    # array independently and destroy the row <-> label pairing.
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    # NOTE(review): assumes all five ratings occur in the test split;
    # otherwise the label count will not match the matrix - confirm.
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(out_dir / filename)
    plt.close()
    print(f"saved plot: {filename}")
@click.command('plot:link-classifier')
def link_confusion():
    """Fit a kNN bias classifier on the link matrix and plot its predictions.

    Fits a 5-nearest-neighbour classifier on the full parent x child
    link-count matrix (no hold-out: predictions are made on the training
    rows), then writes two figures to ``out_dir``:

    * ``link_classifier_confusion.png`` - confusion matrix of rating vs.
      prediction;
    * ``link_classifier_pca.png`` - publishers in the first two columns of
      ``top.publisher_pca_onehot``, coloured by predicted rating.

    The previous version saved both figures to ``link_confusion.png``, so
    the scatter plot overwrote the confusion matrix.
    """
    # NOTE(review): this function reuses the Python name of the
    # 'plot:link-confusion' command defined earlier in this module and
    # therefore shadows it at module level. The name is kept so existing
    # references keep resolving to this command; consider renaming.
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    confusion_filename = "link_classifier_confusion.png"
    scatter_filename = "link_classifier_pca.png"

    DB = connect()
    try:
        # Keep the local name `bias`: the next query selects "from bias",
        # presumably resolved through DuckDB's replacement scan of this
        # DataFrame - TODO confirm.
        bias = DB.query("""
            SELECT
                p.id as publisher_id
                ,b.ordinal
            FROM top.publishers p
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

        df = DB.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()

        data = DB.query("""
            SELECT
                p.id as publisher_id
                ,pca.first
                ,pca.second
            FROM top.publisher_pca_onehot pca
            JOIN top.publishers p
                ON pca.publisher_id = p.id
        """).df()
    finally:
        # The original never closed the connection.
        DB.close()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
    x = pivot.values

    # Look labels up by the pivot's row index so features, labels and
    # predictions all refer to the same publisher on every row.
    y = bias.set_index('publisher_id').loc[pivot.index, 'ordinal']

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x, y)
    y_pred = model.predict(x)

    plot = pd.DataFrame({
        'publisher_id': pivot.index.to_numpy(),
        'ordinal': y.to_numpy(),
        'pred': y_pred,
    })
    data = pd.merge(plot, data, on='publisher_id')

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
    # NOTE(review): assumes all five ratings are present; confirm.
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(out_dir / confusion_filename)
    plt.close()
    print(f"saved plot: {confusion_filename}")

    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
    plt.savefig(out_dir / scatter_filename)
    plt.close()
    print(f"saved plot: {scatter_filename}")

View File

@ -8,6 +8,7 @@ import numpy as np
import pandas as pd import pandas as pd
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures' out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
@click.command('plot:sentiment-over-time') @click.command('plot:sentiment-over-time')
def over_time(): def over_time():
filename = "sentiment_over_time.png" filename = "sentiment_over_time.png"
@ -30,6 +31,7 @@ def over_time():
plt.tight_layout() plt.tight_layout()
plt.savefig(out_path / filename) plt.savefig(out_path / filename)
print(f"saved: {filename}") print(f"saved: {filename}")
@click.command('plot:bias-vs-sentiment-over-time') @click.command('plot:bias-vs-sentiment-over-time')
def bias_over_time(): def bias_over_time():
filename = "bias_vs_sentiment_over_time.png" filename = "bias_vs_sentiment_over_time.png"
@ -38,8 +40,9 @@ def bias_over_time():
data = DB.sql(""" data = DB.sql("""
SELECT SELECT
avg(sent.class_id) as sentiment avg(sent.class_id) as sentiment
,s.published_at as date ,date_trunc('yearweek', s.published_at) as date
,b.id as bias_id --,b.ordinal as ordinal
,b.bias
FROM top.story_sentiments sent FROM top.story_sentiments sent
JOIN top.stories s JOIN top.stories s
ON s.id = sent.story_id ON s.id = sent.story_id
@ -48,13 +51,88 @@ def bias_over_time():
JOIN bias_ratings b JOIN bias_ratings b
ON b.id = pb.bias_id ON b.id = pb.bias_id
GROUP BY GROUP BY
s.published_at date_trunc('yearweek', s.published_at)
,b.id ,b.bias
""").df() """).df()
DB.close() DB.close()
ax = sns.relplot(x=data['date'], y=data['sentiment'], col=data['bias_id']) order = ['left', 'left-center', 'center', 'right-center', 'right']
ax.set(title="sentiment vs. time grouped by bias") ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order)
plt.tight_layout()
plt.savefig(out_path / filename)
plt.close()
print(f"saved: {filename}")
@click.command('plot:sentiment-recent-winner')
def bias_vs_recent_winner():
    """Scatter average title sentiment against days to the nearest election.

    Averages ``story_sentiments.class_id`` per (``days_away``, bias ordinal)
    over stories joined to ``election_distance`` on publish date, colours
    the points by bias ordinal, and writes the figure to
    ``out_path / "bias_vs_recent_winner.png"``.
    """
    filename = "bias_vs_recent_winner.png"

    DB = connect()
    try:
        data = DB.sql("""
            SELECT
                e.days_away as days_away
                ,b.ordinal
                ,avg(sent.class_id) as sentiment
                ,count(1) as stories
            FROM top.stories s
            JOIN top.story_sentiments sent
                ON s.id = sent.story_id
            JOIN election_distance e
                ON e.publish_date = s.published_at
            JOIN publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
            GROUP BY
                e.days_away
                ,b.ordinal
        """).df()
    finally:
        # Release the connection even when the query raises.
        DB.close()

    # NOTE(review): despite the function name, the query does not read the
    # election winner column - confirm whether that was intended.
    ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal'])
    ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title sentiment")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    plt.close()
    print(f"saved: {filename}")
@click.command('plot:sentiment-hist')
def sentiment_hist():
    """Save a bar chart of the number of distinct stories per sentiment label.

    Queries ``top.story_sentiments`` joined to ``top.stories``, prints the
    per-label summary (stories and publishers) as a markdown table, and
    writes the chart to ``out_path / "sentiment_hist.png"``.

    The previous version discarded the sentiment query and saved the
    bias-rating histogram under the sentiment filename.
    """
    filename = "sentiment_hist.png"

    DB = connect()
    try:
        data = DB.query("""
            select
                sent.label
                ,count(distinct s.id) as stories
                ,count(distinct s.publisher_id) as publishers
            from top.story_sentiments sent
            join top.stories s
                on s.id = sent.story_id
            group by
                sent.label
            order by
                sent.label
        """).df()
    finally:
        # Release the connection even when the query raises.
        DB.close()

    # The original built this table and threw it away; emit it so the
    # summary is usable in the write-up.
    print(data.to_markdown(index=False))

    # ORDER BY in the query fixes the bar order; GROUP BY alone does not
    # guarantee one.
    ax = sns.barplot(x=data['label'], y=data['stories'], color='tab:blue')
    ax.set(title="count of stories per sentiment", xlabel="sentiment")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Close the figure so later plots in the same process start clean.
    plt.close()
    print(f"saved: {filename}")