finish presentation.
After Width: | Height: | Size: 21 KiB |
After Width: | Height: | Size: 16 KiB |
After Width: | Height: | Size: 235 KiB |
After Width: | Height: | Size: 202 KiB |
After Width: | Height: | Size: 30 KiB |
After Width: | Height: | Size: 65 KiB |
After Width: | Height: | Size: 12 KiB |
After Width: | Height: | Size: 117 KiB |
After Width: | Height: | Size: 7.2 KiB |
After Width: | Height: | Size: 140 KiB |
After Width: | Height: | Size: 10 KiB |
|
@ -67,6 +67,7 @@ if __name__ == "__main__":
|
|||
|
||||
cli.add_command(plots.sentiment.over_time)
|
||||
cli.add_command(plots.sentiment.bias_over_time)
|
||||
cli.add_command(plots.sentiment.bias_vs_recent_winner)
|
||||
|
||||
|
||||
cli()
|
||||
|
|
|
@ -348,6 +348,7 @@ def create_elections_table():
|
|||
row_number() over() as id
|
||||
,type
|
||||
,date
|
||||
,winner
|
||||
FROM df
|
||||
""")
|
||||
|
||||
|
@ -359,6 +360,7 @@ def create_elections_table():
|
|||
,e.id as election_id
|
||||
,e.date as election_date
|
||||
,s.published_at as publish_date
|
||||
,e.winner as winner
|
||||
FROM (
|
||||
SELECT
|
||||
DISTINCT
|
||||
|
@ -373,6 +375,7 @@ def create_elections_table():
|
|||
,publish_date
|
||||
,election_date
|
||||
,election_id
|
||||
,winner
|
||||
FROM cte
|
||||
)
|
||||
SELECT
|
||||
|
@ -380,6 +383,7 @@ def create_elections_table():
|
|||
,publish_date
|
||||
,election_date
|
||||
,election_id
|
||||
,winner
|
||||
FROM windowed
|
||||
WHERE rn = 1
|
||||
""")
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
import click
|
||||
from data.main import connect
|
||||
import os
|
||||
from pathlib import Path
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Directory the plot commands below save their figures into.
# NOTE: os.getenv returns None when DATA_MINING_DOC_DIR is unset, and
# Path(None) raises TypeError, so importing this module requires the variable.
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
|
||||
|
||||
@click.command('plot:bias-hist')
def hist():
    """Save a bar chart of the number of stories per bias rating.

    Counts stories grouped by ``bias_ratings.ordinal`` and writes the chart
    to ``out_path / "bias_hist.png"``.
    """
    filename = "bias_hist.png"

    DB = connect()
    try:
        data = DB.sql("""
            SELECT
                b.ordinal
                ,count(1) as stories
            FROM stories s
            JOIN publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
            GROUP BY
                b.ordinal
        """).df()
    finally:
        # Release the connection even when the query fails.
        DB.close()

    ax = sns.barplot(x=data['ordinal'], y=data['stories'], color='tab:blue')
    # NOTE(review): labels are applied by position, so this assumes the query
    # returns exactly five ordinals running left -> right -- confirm.
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="count of stories per bias rating", xlabel="bias rating", xticklabels=ticklabels)
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Close the figure so it cannot bleed into another plot drawn in the same
    # process (matches publisher_hist).
    plt.close()
    print(f"saved: {filename}")
|
||||
@click.command('plot:bias-publisher-hist')
def publisher_hist():
    """Save a bar chart of the number of publishers per bias rating.

    Counts ``publisher_bias`` rows grouped by ``bias_ratings.ordinal`` and
    writes the chart to ``out_path / "bias_publisher_hist.png"``.
    """
    filename = "bias_publisher_hist.png"

    DB = connect()
    try:
        data = DB.sql("""
            SELECT
                b.ordinal
                ,count(1) as publishers
            FROM publisher_bias pb
            JOIN bias_ratings b
                ON b.id = pb.bias_id
            GROUP BY
                b.ordinal
        """).df()
    finally:
        # Release the connection even when the query fails.
        DB.close()

    ax = sns.barplot(x=data['ordinal'], y=data['publishers'], color='tab:blue')
    # NOTE(review): labels are applied by position, so this assumes the query
    # returns exactly five ordinals running left -> right -- confirm.
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="count of publishers per bias rating", xlabel="bias rating", xticklabels=ticklabels)
    plt.tight_layout()
    plt.savefig(out_path / filename)
    plt.close()
    print(f"saved: {filename}")
|
|
@ -115,3 +115,45 @@ def emotion_regression():
|
|||
plt.tight_layout()
|
||||
plt.savefig(out_path / filename)
|
||||
print(f"saved: {filename}")
|
||||
|
||||
@click.command('plot:emotion-hist')
def emotion_hist():
    """Save a bar chart of the number of stories per emotion label.

    Counts distinct stories (and distinct publishers) per ``emotions.label``
    and writes the story counts to ``out_path / "emotion_hist.png"``.

    The earlier version discarded the emotion query result and plotted the
    bias-rating histogram instead (a copy of ``plot:bias-hist``) under the
    emotion filename; the chart now uses the emotion counts.
    """
    filename = "emotion_hist.png"

    DB = connect()
    try:
        data = DB.query("""
            select
                e.label
                ,count(distinct s.id) as stories
                ,count(distinct s.publisher_id) as publishers
            from story_emotions se
            join emotions e
                on e.id = se.emotion_id
            join top.stories s
                on s.id = se.story_id
            group by
                e.label
        """).df()
    finally:
        # Release the connection even when the query fails.
        DB.close()

    ax = sns.barplot(x=data['label'], y=data['stories'], color='tab:blue')
    ax.set(title="count of stories per emotion", xlabel="emotion")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Close the figure so it cannot bleed into another plot drawn in the same
    # process.
    plt.close()
    print(f"saved: {filename}")
|
||||
|
|
|
@ -112,3 +112,134 @@ def test():
|
|||
# .df().to_csv(data_dir / 'cluster_publishers.csv', sep="|", index=False)
|
||||
DB.close()
|
||||
|
||||
|
||||
@click.command('plot:link-confusion')
def link_confusion():
    """Save a confusion matrix for a kNN bias classifier on held-out data.

    Builds a publisher x publisher link-count matrix from ``top.link_edges``
    (restricted to publishers that have a bias rating), fits a 5-nearest-
    neighbour classifier on a random train split to predict the bias ordinal,
    and writes the test-split confusion matrix to
    ``out_dir / "link_confusion.png"``.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    filename = "link_confusion.png"

    DB = connect()
    try:
        # Keep this local named `bias`: the second query refers to it by name
        # (`from bias`), presumably resolved by DuckDB's replacement scan of
        # Python variables -- confirm before renaming.
        bias = DB.query("""
            SELECT
                p.id as publisher_id
                ,b.ordinal
            FROM top.publishers p
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

        df = DB.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()
    finally:
        # The connection was previously never closed.
        DB.close()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    x = pivot.values
    # Take labels in the pivot's row order so row i of x and y[i] describe the
    # same publisher, even when some rated publishers never appear as a parent.
    y = bias.set_index('publisher_id')['ordinal'].loc[pivot.index]

    # Split features and labels in ONE call: separate calls shuffle
    # independently and pair each feature row with another publisher's label.
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    # NOTE(review): positional labels assume all five ordinals occur in the
    # test split -- confirm.
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(out_dir / filename)
    plt.close()
    print(f"saved plot: {filename}")
|
||||
|
||||
# NOTE(review): this function reuses the name `link_confusion` and therefore
# shadows the `plot:link-confusion` command defined earlier in the module.
# The name is kept so existing references keep resolving to this command;
# consider renaming it to `link_classifier` together with its callers.
@click.command('plot:link-classifier')
def link_confusion():
    """Fit a kNN bias classifier on the link matrix and plot its predictions.

    Builds a publisher x publisher link-count matrix from ``top.link_edges``
    (restricted to publishers that have a bias rating), fits a 5-nearest-
    neighbour classifier on ALL rows and predicts on the same rows (in-sample,
    no hold-out), then writes two figures to ``out_dir``:

    * ``link_classifier_confusion.png`` -- confusion matrix of true vs.
      predicted bias ordinal.
    * ``link_classifier.png`` -- publishers on their first two PCA components
      (``top.publisher_pca_onehot``), coloured by predicted ordinal.

    Both figures were previously saved to ``link_confusion.png``, so the
    scatter plot silently overwrote the confusion matrix (and the output of
    ``plot:link-confusion``).
    """
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import ConfusionMatrixDisplay

    confusion_filename = "link_classifier_confusion.png"
    scatter_filename = "link_classifier.png"

    DB = connect()
    try:
        # Keep this local named `bias`: the second query refers to it by name
        # (`from bias`), presumably resolved by DuckDB's replacement scan of
        # Python variables -- confirm before renaming.
        bias = DB.query("""
            SELECT
                p.id as publisher_id
                ,b.ordinal
            FROM top.publishers p
            JOIN top.publisher_bias pb
                ON pb.publisher_id = p.id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
        """).df()

        df = DB.query("""
            SELECT
                *
            FROM top.link_edges
            WHERE parent_id in (
                select
                    publisher_id
                from bias
            )
            AND child_id in (
                select
                    publisher_id
                from bias
            )
        """).df()

        data = DB.query("""
            SELECT
                p.id as publisher_id
                ,pca.first
                ,pca.second
            FROM top.publisher_pca_onehot pca
            JOIN top.publishers p
                ON pca.publisher_id = p.id
        """).df()
    finally:
        # The connection was previously never closed.
        DB.close()

    pivot = df.pivot(index='parent_id', columns='child_id', values='links').fillna(0)

    x = pivot.values
    # Take labels in the pivot's row order so row i of x and label i describe
    # the same publisher, even when some rated publishers never appear as a
    # parent.
    plot = bias.set_index('publisher_id').loc[pivot.index, ['ordinal']]
    plot = plot.rename_axis('publisher_id').reset_index()
    y = plot['ordinal']

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x, y)
    plot['pred'] = model.predict(x)

    # publisher_id is the only column the two frames share.
    data = pd.merge(plot, data, on='publisher_id')

    fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(data['ordinal'], data['pred'], ax=ax)
    # NOTE(review): positional labels assume all five ordinals occur -- confirm.
    ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
    ax.set(title="confusion matrix for link matrix kNN classifier", xticklabels=ticklabels, yticklabels=ticklabels)
    plt.savefig(out_dir / confusion_filename)
    plt.close()
    print(f"saved plot: {confusion_filename}")

    ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['pred'])
    plt.savefig(out_dir / scatter_filename)
    plt.close()
    print(f"saved plot: {scatter_filename}")
|
||||
|
|
|
@ -8,6 +8,7 @@ import numpy as np
|
|||
import pandas as pd
|
||||
|
||||
# Directory the plot commands below save their figures into.
# NOTE: os.getenv returns None when DATA_MINING_DOC_DIR is unset, and
# Path(None) raises TypeError, so importing this module requires the variable.
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
|
||||
|
||||
@click.command('plot:sentiment-over-time')
|
||||
def over_time():
|
||||
filename = "sentiment_over_time.png"
|
||||
|
@ -30,6 +31,7 @@ def over_time():
|
|||
plt.tight_layout()
|
||||
plt.savefig(out_path / filename)
|
||||
print(f"saved: {filename}")
|
||||
|
||||
@click.command('plot:bias-vs-sentiment-over-time')
|
||||
def bias_over_time():
|
||||
filename = "bias_vs_sentiment_over_time.png"
|
||||
|
@ -38,8 +40,9 @@ def bias_over_time():
|
|||
data = DB.sql("""
|
||||
SELECT
|
||||
avg(sent.class_id) as sentiment
|
||||
,s.published_at as date
|
||||
,b.id as bias_id
|
||||
,date_trunc('yearweek', s.published_at) as date
|
||||
--,b.ordinal as ordinal
|
||||
,b.bias
|
||||
FROM top.story_sentiments sent
|
||||
JOIN top.stories s
|
||||
ON s.id = sent.story_id
|
||||
|
@ -48,13 +51,88 @@ def bias_over_time():
|
|||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
GROUP BY
|
||||
s.published_at
|
||||
,b.id
|
||||
date_trunc('yearweek', s.published_at)
|
||||
,b.bias
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
ax = sns.relplot(x=data['date'], y=data['sentiment'], col=data['bias_id'])
|
||||
ax.set(title="sentiment vs. time grouped by bias")
|
||||
order = ['left', 'left-center', 'center', 'right-center', 'right']
|
||||
ax = sns.relplot(data, x='date', y='sentiment', col='bias', col_order=order)
|
||||
plt.tight_layout()
|
||||
plt.savefig(out_path / filename)
|
||||
plt.close()
|
||||
print(f"saved: {filename}")
|
||||
|
||||
@click.command('plot:sentiment-recent-winner')
def bias_vs_recent_winner():
    """Save a scatter plot of average title sentiment vs. election distance.

    Averages ``story_sentiments.class_id`` per (``election_distance.days_away``,
    ``bias_ratings.ordinal``) pair and writes the scatter plot, coloured by
    ordinal, to ``out_path / "bias_vs_recent_winner.png"``.
    """
    filename = "bias_vs_recent_winner.png"

    DB = connect()
    try:
        data = DB.sql("""
            SELECT
                e.days_away as days_away
                ,b.ordinal
                ,avg(sent.class_id) as sentiment
                ,count(1) as stories
            FROM top.stories s
            JOIN top.story_sentiments sent
                ON s.id = sent.story_id
            JOIN election_distance e
                ON e.publish_date = s.published_at
            JOIN publisher_bias pb
                ON pb.publisher_id = s.publisher_id
            JOIN bias_ratings b
                ON b.id = pb.bias_id
            GROUP BY
                e.days_away
                ,b.ordinal
        """).df()
    finally:
        # Release the connection even when the query fails.
        DB.close()

    ax = sns.scatterplot(x=data['days_away'], y=data['sentiment'], hue=data['ordinal'])
    ax.set(title="sentiment vs. days to nearest election", xlabel="days to nearest election", ylabel="average title sentiment")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    plt.close()

    print(f"saved: {filename}")
|
||||
|
||||
@click.command('plot:sentiment-hist')
def sentiment_hist():
    """Save a bar chart of the number of stories per sentiment label.

    Counts distinct stories (and distinct publishers) per
    ``story_sentiments.label`` and writes the story counts to
    ``out_path / "sentiment_hist.png"``.

    The earlier version discarded the sentiment query result and plotted the
    bias-rating histogram instead (a copy of ``plot:bias-hist``) under the
    sentiment filename; the chart now uses the sentiment counts.
    """
    filename = "sentiment_hist.png"

    DB = connect()
    try:
        data = DB.query("""
            select
                sent.label
                ,count(distinct s.id) as stories
                ,count(distinct s.publisher_id) as publishers
            from top.story_sentiments sent
            join top.stories s
                on s.id = sent.story_id
            group by
                sent.label
        """).df()
    finally:
        # Release the connection even when the query fails.
        DB.close()

    ax = sns.barplot(x=data['label'], y=data['stories'], color='tab:blue')
    ax.set(title="count of stories per sentiment", xlabel="sentiment")
    plt.tight_layout()
    plt.savefig(out_path / filename)
    # Close the figure so it cannot bleed into another plot drawn in the same
    # process.
    plt.close()
    print(f"saved: {filename}")
|
||||
|
|