add exp 4/5
parent 74c2d8afa2, commit 3f7b3ad467

6 binary image files changed (not shown); sizes after: 51 KiB, 275 KiB, 44 KiB, 30 KiB, 44 KiB, 128 KiB
|
@ -54,6 +54,11 @@ allsides.com <!-- .element: class="fragment" -->
|
|||
|
||||
huggingface.com <!-- .element: class="fragment" -->
|
||||
|
||||
Note:
|
||||
Let's get a handle on the shape of the data.
|
||||
|
||||
The sources, size, and features of the data.
|
||||
|
||||
===
|
||||
|
||||
<section data-background-iframe="https://www.memeorandum.com" data-background-interactive></section>
|
||||
|
@ -128,7 +133,7 @@ huggingface.com <!-- .element: class="fragment" -->
|
|||
===
|
||||
|
||||
# Data Structures
|
||||
Stories
|
||||
## Stories
|
||||
|
||||
- Top level stories. <!-- .element: class="fragment" -->
|
||||
- title.
|
||||
|
@ -142,7 +147,7 @@ Stories
|
|||
==
|
||||
|
||||
# Data Structures
|
||||
Bias
|
||||
## Bias
|
||||
|
||||
- Per publisher. <!-- .element: class="fragment" -->
|
||||
- name.
|
||||
|
@ -153,7 +158,7 @@ Bias
|
|||
==
|
||||
|
||||
# Data Structures
|
||||
Embeddings
|
||||
## Embeddings
|
||||
|
||||
- Per story title. <!-- .element: class="fragment" -->
|
||||
- sentence embedding (n, 384).
|
||||
|
@ -169,7 +174,7 @@ Embeddings
|
|||
|
||||
# Data Collection
|
||||
|
||||
Story Scraper (simplified)
|
||||
## Story Scraper (simplified)
|
||||
|
||||
```python
|
||||
day = timedelta(days=1)
|
||||
|
@ -187,7 +192,8 @@ while cur <= end:
|
|||
==
|
||||
|
||||
# Data Collection
|
||||
Bias Scraper (hard)
|
||||
|
||||
## Bias Scraper (hard)
|
||||
|
||||
```python
|
||||
...
|
||||
|
@ -206,14 +212,16 @@ for row in rows:
|
|||
==
|
||||
|
||||
# Data Collection
|
||||
Bias Scraper (easy)
|
||||
|
||||
## Bias Scraper (easy)
|
||||
|
||||
![allsides request](https://studentweb.cs.wwu.edu/~jensen33/static/577/allsides_request.png)
|
||||
|
||||
==
|
||||
|
||||
# Data Collection
|
||||
Embeddings (easy)
|
||||
|
||||
## Embeddings (easy)
|
||||
|
||||
```python
|
||||
# table = ...
|
||||
|
@ -230,7 +238,8 @@ for chunk in table:
|
|||
==
|
||||
|
||||
# Data Collection
|
||||
Classification Embeddings (medium)
|
||||
|
||||
## Classification Embeddings (medium)
|
||||
|
||||
```python
|
||||
...
|
||||
|
@ -249,7 +258,8 @@ for i, class_id in enumerate(class_ids):
|
|||
==
|
||||
|
||||
# Data Selection
|
||||
Stories
|
||||
|
||||
## Stories
|
||||
|
||||
- Clip the first and last full year of stories. <!-- .element: class="fragment" -->
|
||||
- Remove duplicate stories (big stories span multiple days). <!-- .element: class="fragment" -->
|
||||
|
@ -257,7 +267,7 @@ Stories
|
|||
==
|
||||
# Data Selection
|
||||
|
||||
Publishers
|
||||
## Publishers
|
||||
|
||||
- Combine subdomains of stories. <!-- .element: class="fragment" -->
|
||||
- blog.washingtonpost.com and washingtonpost.com are considered the same publisher.
|
||||
|
@ -267,7 +277,7 @@ Publishers
|
|||
|
||||
# Data Selection
|
||||
|
||||
Links
|
||||
## Links
|
||||
|
||||
- Select only stories with publishers whose story had been a 'parent' ('original publishers'). <!-- .element: class="fragment" -->
|
||||
- Eliminates small blogs and non-original news.
|
||||
|
@ -279,7 +289,7 @@ Links
|
|||
|
||||
# Data Selection
|
||||
|
||||
Bias
|
||||
## Bias
|
||||
|
||||
- Keep all ratings, even ones with low agree/disagree ratio.
|
||||
- Join datasets on publisher name.
|
||||
|
@ -292,7 +302,7 @@ Bias
|
|||
|
||||
# Descriptive Stats
|
||||
|
||||
Raw
|
||||
## Raw
|
||||
|
||||
| metric | value |
|
||||
|:------------------|--------:|
|
||||
|
@ -307,7 +317,7 @@ Raw
|
|||
==
|
||||
# Descriptive Stats
|
||||
|
||||
Stories Per Publisher
|
||||
## Stories Per Publisher
|
||||
|
||||
![stories per publisher](/static/577/stories_per_publisher.png)
|
||||
|
||||
|
@ -315,7 +325,7 @@ Stories Per Publisher
|
|||
|
||||
# Descriptive Stats
|
||||
|
||||
Top Publishers
|
||||
## Top Publishers
|
||||
|
||||
![top publishers](https://studentweb.cs.wwu.edu/~jensen33/static/577/top_publishers.png)
|
||||
|
||||
|
@ -323,7 +333,7 @@ Top Publishers
|
|||
|
||||
# Descriptive Stats
|
||||
|
||||
Articles Per Year
|
||||
## Articles Per Year
|
||||
|
||||
![articles per year](https://studentweb.cs.wwu.edu/~jensen33/static/577/articles_per_year.png)
|
||||
|
||||
|
@ -331,7 +341,7 @@ Articles Per Year
|
|||
|
||||
# Descriptive Stats
|
||||
|
||||
Common TLDs
|
||||
## Common TLDs
|
||||
|
||||
![common tlds](https://studentweb.cs.wwu.edu/~jensen33/static/577/common_tld.png)
|
||||
|
||||
|
@ -339,9 +349,9 @@ Common TLDs
|
|||
|
||||
# Descriptive Stats
|
||||
|
||||
Post Process
|
||||
## Post Process
|
||||
|
||||
| key | value |
|
||||
| metric | value |
|
||||
|:------------------|--------:|
|
||||
| total stories | 251553 |
|
||||
| total related | 815183 |
|
||||
|
@ -352,6 +362,7 @@ Post Process
|
|||
| top level domains | 234 |
|
||||
|
||||
===
|
||||
|
||||
# Experiments
|
||||
|
||||
1. **clustering** on link similarity. <!-- .element: class="fragment" -->
|
||||
|
@ -361,9 +372,16 @@ Post Process
|
|||
5. **regression** on emotional classification over time and publication. <!-- .element: class="fragment" -->
|
||||
|
||||
===
|
||||
|
||||
# Experiment 1
|
||||
|
||||
Setup
|
||||
**clustering** on link similarity.
|
||||
|
||||
==
|
||||
|
||||
# Experiment 1
|
||||
|
||||
## Setup
|
||||
|
||||
- Create one-hot encoding of links between publishers. <!-- .element: class="fragment" -->
|
||||
- Cluster the encoding. <!-- .element: class="fragment" -->
|
||||
|
@ -379,7 +397,7 @@ Principle Component Analysis:
|
|||
|
||||
# Experiment 1
|
||||
|
||||
One Hot Encoding
|
||||
## One Hot Encoding
|
||||
|
||||
| publisher | nytimes| wsj| newsweek| ...|
|
||||
|:----------|--------:|----:|--------:|----:|
|
||||
|
@ -392,7 +410,7 @@ One Hot Encoding
|
|||
|
||||
# Experiment 1
|
||||
|
||||
n-Hot Encoding
|
||||
## n-Hot Encoding
|
||||
|
||||
| publisher | nytimes| wsj| newsweek| ...|
|
||||
|:----------|--------:|----:|--------:|----:|
|
||||
|
@ -405,7 +423,7 @@ n-Hot Encoding
|
|||
|
||||
# Experiment 1
|
||||
|
||||
Normalized n-Hot Encoding
|
||||
## Normalized n-Hot Encoding
|
||||
|
||||
| publisher | nytimes| wsj| newsweek| ...|
|
||||
|:----------|--------:|----:|--------:|----:|
|
||||
|
@ -418,7 +436,7 @@ Normalized n-Hot Encoding
|
|||
|
||||
# Experiment 1
|
||||
|
||||
Elbow criterion
|
||||
## Elbow criterion
|
||||
|
||||
![elbow](https://studentweb.cs.wwu.edu/~jensen33/static/577/link_cluster_elbow.png)
|
||||
|
||||
|
@ -434,7 +452,7 @@ Percentage of variance explained is the ratio of the between-group variance to t
|
|||
|
||||
# Experiment 1
|
||||
|
||||
Link Magnitude
|
||||
## Link Magnitude
|
||||
|
||||
![link magnitude cluster](https://studentweb.cs.wwu.edu/~jensen33/static/577/link_pca_clusters_links.png)
|
||||
|
||||
|
@ -442,7 +460,7 @@ Link Magnitude
|
|||
|
||||
# Experiment 1
|
||||
|
||||
Normalized
|
||||
## Normalized
|
||||
|
||||
![link normalized cluster](https://studentweb.cs.wwu.edu/~jensen33/static/577/link_pca_clusters_normalized.png)
|
||||
|
||||
|
@ -450,7 +468,7 @@ Normalized
|
|||
|
||||
# Experiment 1
|
||||
|
||||
Onehot
|
||||
## One Hot
|
||||
|
||||
![link onehot cluster](https://studentweb.cs.wwu.edu/~jensen33/static/577/link_pca_clusters_onehot.png)
|
||||
|
||||
|
@ -458,20 +476,27 @@ Onehot
|
|||
|
||||
# Experiment 1
|
||||
|
||||
Discussion
|
||||
## Discussion
|
||||
|
||||
- Best encoding: One hot. <!-- .element: class="fragment" -->
|
||||
- Clusters based on total links otherwise.
|
||||
- Clusters, but no explanation
|
||||
- Limitation: need the link encoding to cluster.
|
||||
- Clusters, but no explanation. <!-- .element: class="fragment" -->
|
||||
- Limitation: need the link encoding to cluster. <!-- .element: class="fragment" -->
|
||||
- Smaller publishers might not link very much.
|
||||
- TODO: Association Rule Mining. <!-- .element: class="fragment" -->
|
||||
|
||||
===
|
||||
|
||||
# Experiment 2
|
||||
|
||||
Setup
|
||||
**classification** on link similarity.
|
||||
|
||||
==
|
||||
|
||||
# Experiment 2
|
||||
|
||||
## Setup
|
||||
|
||||
- **clustering**. <!-- .element: class="fragment" -->
|
||||
- Create features: <!-- .element: class="fragment" -->
|
||||
- Publisher frequency.
|
||||
- Reuse link encodings.
|
||||
|
@ -483,7 +508,8 @@ Note:
|
|||
|
||||
==
|
||||
# Experiment 2
|
||||
Descriptive stats
|
||||
|
||||
## Descriptive stats
|
||||
|
||||
| metric | value |
|
||||
|:------------|:----------|
|
||||
|
@ -498,7 +524,7 @@ Descriptive stats
|
|||
|
||||
# Experiment 2
|
||||
|
||||
PCA + Labels
|
||||
## PCA + Labels
|
||||
|
||||
![pca vs. bias labels](https://studentweb.cs.wwu.edu/~jensen33/static/577/pca_with_classes.png)
|
||||
|
||||
|
@ -506,7 +532,7 @@ PCA + Labels
|
|||
|
||||
# Experiment 2
|
||||
|
||||
Discussion
|
||||
## Discussion
|
||||
|
||||
- Link encodings (and their PCA) are useful. <!-- .element: class="fragment" -->
|
||||
- Labels are (sort of) separated and clustered.
|
||||
|
@ -515,7 +541,7 @@ Discussion
|
|||
|
||||
# Experiment 2
|
||||
|
||||
Limitations
|
||||
## Limitations
|
||||
|
||||
- Dependent on accurate rating. <!-- .element: class="fragment" -->
|
||||
- Ordinal ratings not available. <!-- .element: class="fragment" -->
|
||||
|
@ -527,11 +553,258 @@ Limitations
|
|||
|
||||
# Experiment 3
|
||||
|
||||
Setup
|
||||
**classification** on sentence embedding.
|
||||
|
||||
==
|
||||
|
||||
# Limitations
|
||||
# Experiment 3
|
||||
|
||||
## Setup
|
||||
|
||||
|
||||
- **classification**. <!-- .element: class="fragment" -->
|
||||
- Generate sentence embedding for each title. <!-- .element: class="fragment" -->
|
||||
- Rerun PCA analysis on title embeddings. <!-- .element: class="fragment" -->
|
||||
- Use kNN classifier to map embedding features to bias rating. <!-- .element: class="fragment" -->
|
||||
|
||||
==
|
||||
|
||||
# Experiment 3
|
||||
|
||||
## Sentence Embeddings
|
||||
|
||||
1. Extract titles.
|
||||
2. Tokenize titles.
|
||||
3. Pick pretrained Language Model.
|
||||
4. Generate embeddings from tokens.
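
A minimal sketch of these four steps, assuming a sentence-transformers MiniLM checkpoint (the model name is an assumption; its 384-dimension output matches the (n, 384) shape mentioned earlier):

```python
# Sketch: titles -> tokens -> pretrained model -> sentence embeddings.
# "all-MiniLM-L6-v2" is an assumed checkpoint; tokenization happens inside encode().
from sentence_transformers import SentenceTransformer

titles = ["Spain, Land of 10 P.M. Dinners, Asks if It's Time to Reset Clock"]
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(titles)  # numpy array of shape (len(titles), 384)
```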
|
||||
|
||||
==
|
||||
|
||||
# Experiment 3
|
||||
|
||||
## Tokens
|
||||
|
||||
**The sentence:**
|
||||
|
||||
"Spain, Land of 10 P.M. Dinners, Asks if It's Time to Reset Clock"
|
||||
|
||||
**Tokenizes to:**
|
||||
|
||||
```
|
||||
['[CLS]', 'spain', ',', 'land', 'of', '10', 'p', '.', 'm', '.',
|
||||
'dinners', ',', 'asks', 'if', 'it', "'", 's', 'time', 'to',
|
||||
'reset', 'clock', '[SEP]']
|
||||
```
|
||||
|
||||
Note:
|
||||
[CLS] is unique to BERT models and stands for classification.
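
A hedged sketch of how tokens like these could be produced with a BERT tokenizer (the 'bert-base-uncased' checkpoint is an assumption, not necessarily the one used here):

```python
# Sketch: wordpiece tokenization of a title with a BERT-style tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
sentence = "Spain, Land of 10 P.M. Dinners, Asks if It's Time to Reset Clock"
print(tokenizer.tokenize(sentence))      # wordpieces only
print(tokenizer(sentence)["input_ids"])  # ids, with [CLS]/[SEP] added
```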
|
||||
|
||||
==
|
||||
|
||||
# Experiment 3
|
||||
|
||||
## Tokens
|
||||
|
||||
**The sentence:**
|
||||
|
||||
"NPR/PBS NewsHour/Marist Poll Results and Analysis"
|
||||
|
||||
**Tokenizes to:**
|
||||
|
||||
```
|
||||
['[CLS]', 'npr', '/', 'pbs', 'news', '##ho', '##ur', '/', 'maris',
|
||||
'##t', 'poll', 'results', 'and', 'analysis', '[SEP]', '[PAD]',
|
||||
'[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
|
||||
```
|
||||
|
||||
Note:
|
||||
The padding is there to make all tokenized vectors equal length.
|
||||
|
||||
The tokenizer also outputs a mask vector that the language model uses to ignore the padding.
|
||||
|
||||
==
|
||||
|
||||
# Experiment 3
|
||||
|
||||
## Embeddings
|
||||
|
||||
- Using a BERT (Bidirectional Encoder Representations from Transformers) based model.
|
||||
- Input: tokens.
|
||||
- Output: dense vectors representing 'semantic meaning' of tokens.
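
A rough sketch of this step, assuming a small BERT-style encoder from the Hugging Face hub (the checkpoint name and the 384 hidden size are assumptions based on the shapes shown in this deck):

```python
# Sketch: tokens -> one dense vector per token (last hidden state).
import torch
from transformers import AutoTokenizer, AutoModel

name = "sentence-transformers/all-MiniLM-L6-v2"   # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name)

enc = tokenizer("NPR/PBS NewsHour/Marist Poll Results and Analysis",
                padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    out = model(**enc)
print(out.last_hidden_state.shape)  # (1, n_tokens, 384)
```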
|
||||
|
||||
==
|
||||
|
||||
# Experiment 3
|
||||
|
||||
## Embeddings
|
||||
|
||||
**The tokens:**
|
||||
|
||||
```
|
||||
['[CLS]', 'npr', '/', 'pbs', 'news', '##ho', '##ur', '/', 'maris',
|
||||
'##t', 'poll', 'results', 'and', 'analysis', '[SEP]', '[PAD]',
|
||||
'[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
|
||||
```
|
||||
|
||||
**Embeds to a vector (1, 384):**
|
||||
|
||||
```
|
||||
array([[ 0.12444635, -0.05962477, -0.00127911, ..., 0.13943022,
|
||||
-0.2552534 , -0.00238779],
|
||||
[ 0.01535596, -0.05933844, -0.0099495 , ..., 0.48110735,
|
||||
0.1370568 , 0.3285091 ],
|
||||
[ 0.2831368 , -0.4200529 , 0.10879617, ..., 0.15663117,
|
||||
-0.29782432, 0.4289513 ],
|
||||
...,
|
||||
```
|
||||
==
|
||||
|
||||
# Experiment 3
|
||||
|
||||
## Results
|
||||
|
||||
![pca vs. classes](https://studentweb.cs.wwu.edu/~jensen33/static/577/embedding_sentence_pca.png)
|
||||
|
||||
Note:
|
||||
Not a lot of information in PCA this time.
|
||||
|
||||
==
|
||||
|
||||
# Experiment 3
|
||||
|
||||
## Results
|
||||
|
||||
![pca vs. avg embedding](https://studentweb.cs.wwu.edu/~jensen33/static/577/avg_embedding_sentence_pca.png) <!-- .element: class="r-stretch" -->
|
||||
|
||||
|
||||
Note:
|
||||
What about average publisher embedding?
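
A minimal sketch (with stand-in random data) of the averaged variant: take the mean of all title embeddings per publisher, then project the per-publisher vectors onto two PCA components:

```python
# Sketch with stand-in data: average title embeddings per publisher, then PCA.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(1000, 384))        # stand-in title embeddings
publisher_id = rng.integers(0, 50, size=1000)    # stand-in publisher per title

rows = pd.DataFrame({"publisher_id": publisher_id, "row": np.arange(1000)})
avg = rows.groupby("publisher_id")["row"].apply(lambda idx: embeddings[idx].mean(axis=0))

coords = PCA(n_components=2).fit_transform(np.stack(avg.to_list()))  # (n_publishers, 2)
```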
|
||||
|
||||
==
|
||||
|
||||
# Experiment 3
|
||||
|
||||
## Results
|
||||
|
||||
![knn embedding confusion](https://studentweb.cs.wwu.edu/~jensen33/static/577/sentence_confusion.png)
|
||||
|
||||
Note:
|
||||
Trained a kNN from sklearn.
|
||||
|
||||
Set aside 20% of the data as a test set.
|
||||
|
||||
Once trained, compared the predictions with the true labels on the test set.
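
A minimal sketch of that evaluation with stand-in data (the 5 neighbors and 20% test split follow the numbers used in this project; the feature and label arrays here are random placeholders):

```python
# Sketch with stand-in data: hold out 20%, fit 5-NN, plot the confusion matrix.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay

rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 384))      # stand-in title embeddings
y = rng.integers(0, 5, size=2000)     # stand-in bias ordinals (left .. right)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
ConfusionMatrixDisplay.from_predictions(y_test, knn.predict(X_test))
plt.show()
```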
|
||||
|
||||
==
|
||||
|
||||
# Experiment 3
|
||||
|
||||
## Discussion
|
||||
|
||||
- Embedding space is hard to condense with PCA. <!-- .element: class="fragment" -->
|
||||
- Maybe the classifier is learning to guess 'left-ish'? <!-- .element: class="fragment" -->
|
||||
|
||||
===

# Experiment 4

**classification** on sentiment analysis.

==
# Experiment 4

## Setup

- Use pretrained Language Classifier. <!-- .element: class="fragment" -->
- Previously: Mapped twitter posts to tokens, to embedding, to ['positive', 'negative'] labels. <!-- .element: class="fragment" -->
- Predict: rate of neutral titles decreasing over time.
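
A sketch of that classification step, mirroring the DistilBERT-based `sentiment:extract` command added later in this commit (reduced to the core calls):

```python
# Sketch: titles -> DistilBERT sequence classifier -> 0/1 sentiment class.
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

titles = ["NPR/PBS NewsHour/Marist Poll Results and Analysis"]
enc = tokenizer(titles, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits
classes = logits.argmax(axis=1).tolist()   # later loaded as 1 = 'positive', 0 = 'negative'
```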

==

# Experiment 4

## Results

![sentiment over time](https://studentweb.cs.wwu.edu/~jensen33/static/577/sentiment_over_time.png)

==
# Experiment 4

## Results

![bias vs. sentiment over time](https://studentweb.cs.wwu.edu/~jensen33/static/577/bias_vs_sentiment_over_time.png)

==

# Experiment 4

## Discussion

-

===

# Experiment 5

**regression** on emotional classification over time and publication.

==
# Experiment 5

## Setup

- Use pretrained language classifier. <!-- .element: class="fragment" -->
- Previously: Mapped reddit posts to tokens, to embedding, to emotion labels. <!-- .element: class="fragment" -->
- Predict: rate of neutral titles decreasing over time.
- Classify:
  - features: emotional labels
  - labels: bias
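
A minimal sketch (stand-in data) of the trend-fitting step, mirroring the `plot:emotion-regression` command added in this commit: regress weekly story counts for one publisher/emotion pair against time, and express the slope in stories per year:

```python
# Sketch with stand-in data: linear trend of weekly story counts over time.
import numpy as np
from sklearn import linear_model

weeks = np.arange(52 * 5) * 7 * 24 * 60 * 60               # 5 years of weekly epochs (seconds)
stories = np.random.default_rng(0).poisson(5, len(weeks))  # stand-in weekly counts

reg = linear_model.LinearRegression().fit(weeks.reshape(-1, 1), stories)
per_year = reg.coef_.item() * 60 * 60 * 24 * 365           # slope converted to stories/year
```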

==

# Experiment 5

## Results

![emotion over time](https://studentweb.cs.wwu.edu/~jensen33/static/577/emotion_over_time.png)

==

# Experiment 5

## Results

![emotion regression time](https://studentweb.cs.wwu.edu/~jensen33/static/577/emotion_regression.png)

==

# Experiment 5

## Discussion

- Neutral story titles dominate the dataset. <!-- .element: class="fragment" -->
- Increase in stories published might explain most of the trend. <!-- .element: class="fragment" -->
- Far-right and far-left both became less neutral. <!-- .element: class="fragment" -->
- Left-center and right-center became more emotional, but also neutral. <!-- .element: class="fragment" -->
- Not a lot of movement overall. <!-- .element: class="fragment" -->

===

# Experiment 6 (**TODO**)

## Setup

- Have a lot of features now. <!-- .element: class="fragment" -->
  - Link PCA components.
  - Embedding PCA components.
  - Sentiment.
  - Emotion.
- Can we use all of them to predict bias? (sketch below) <!-- .element: class="fragment" -->
- End user: Is that useful? Where will I get all that at inference time? <!-- .element: class="fragment" -->
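
A rough sketch of that idea, not an implementation: stack the per-story features this project already produces (link PCA components, embedding PCA components, sentiment class, emotion class) into one matrix and fit a classifier against the bias label. The classifier choice and the stand-in data below are assumptions.

```python
# Sketch with stand-in data: combine existing features and predict bias.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
n = 5000
X = np.column_stack([
    rng.normal(size=(n, 2)),       # link_1, link_2 (PCA of link encoding)
    rng.normal(size=(n, 2)),       # embedding PCA components
    rng.integers(0, 2, size=n),    # sentiment class
    rng.integers(0, 7, size=n),    # emotion class (stand-in label count)
])
y = rng.integers(0, 5, size=n)     # bias ordinal

print(cross_val_score(RandomForestClassifier(), X, y, cv=5))
```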

===

# Overall Limitations

- Many different authors under the same publisher. <!-- .element: class="fragment" -->
- Publishers use syndication. <!-- .element: class="fragment" -->

src/cli.py (15 changes)
@ -12,6 +12,8 @@ if __name__ == "__main__":
|
|||
cli.add_command(scrape.parse)
|
||||
cli.add_command(scrape.load)
|
||||
cli.add_command(scrape.normalize)
|
||||
cli.add_command(scrape.create_elections_table)
|
||||
|
||||
import word
|
||||
# cli.add_command(word.distance)
|
||||
# cli.add_command(word.train)
|
||||
|
@ -30,8 +32,11 @@ if __name__ == "__main__":
|
|||
cli.add_command(emotion.normalize)
|
||||
cli.add_command(emotion.analyze)
|
||||
cli.add_command(emotion.create_table)
|
||||
|
||||
import sentence
|
||||
cli.add_command(sentence.embed)
|
||||
cli.add_command(sentence.create_avg_pca_table)
|
||||
|
||||
from train import main as train_main
|
||||
cli.add_command(train_main.main)
|
||||
|
||||
|
@ -54,4 +59,14 @@ if __name__ == "__main__":
|
|||
import plots.classifier as plotc
|
||||
cli.add_command(plotc.pca_with_classes)
|
||||
|
||||
import plots
|
||||
cli.add_command(plots.sentence.sentence_pca)
|
||||
cli.add_command(plots.sentence.avg_sentence_pca)
|
||||
cli.add_command(plots.emotion.emotion_over_time)
|
||||
cli.add_command(plots.emotion.emotion_regression)
|
||||
|
||||
cli.add_command(plots.sentiment.over_time)
|
||||
cli.add_command(plots.sentiment.bias_over_time)
|
||||
|
||||
|
||||
cli()
|
||||
|
|
|
@ -335,3 +335,92 @@ def another_norm():
|
|||
on sv2.id = s.id
|
||||
limit 5
|
||||
""")
|
||||
|
||||
@click.command('data:create-election-table')
|
||||
def create_elections_table():
|
||||
df = pd.read_csv(data_dir() / 'election_dates.csv', sep="|")
|
||||
df['date'] = pd.to_datetime(df.date)
|
||||
|
||||
DB = connect()
|
||||
DB.query("""
|
||||
CREATE OR REPLACE TABLE election_dates AS
|
||||
SELECT
|
||||
row_number() over() as id
|
||||
,type
|
||||
,date
|
||||
FROM df
|
||||
""")
|
||||
|
||||
DB.query("""
|
||||
CREATE OR REPLACE TABLE election_distance AS
|
||||
WITH cte as (
|
||||
SELECT
|
||||
day(e.date - s.published_at) as days_away
|
||||
,e.id as election_id
|
||||
,e.date as election_date
|
||||
,s.published_at as publish_date
|
||||
FROM (
|
||||
SELECT
|
||||
DISTINCT
|
||||
published_at
|
||||
FROM top.stories
|
||||
) s
|
||||
CROSS JOIN election_dates e
|
||||
) , windowed as (
|
||||
SELECT
|
||||
row_number() over(partition by publish_date order by abs(days_away) asc) as rn
|
||||
,days_away
|
||||
,publish_date
|
||||
,election_date
|
||||
,election_id
|
||||
FROM cte
|
||||
)
|
||||
SELECT
|
||||
days_away
|
||||
,publish_date
|
||||
,election_date
|
||||
,election_id
|
||||
FROM windowed
|
||||
WHERE rn = 1
|
||||
""")
|
||||
|
||||
DB.close()
|
||||
|
||||
@click.command('scrape:create-denorm')
|
||||
def create_denorm():
|
||||
|
||||
DB = connect()
|
||||
DB.sql("create schema denorm")
|
||||
DB.sql("""
|
||||
CREATE OR REPLACE TABLE denorm.stories AS
|
||||
SELECT
|
||||
s.id as story_id
|
||||
,s.title
|
||||
,s.url
|
||||
,s.published_at
|
||||
,s.author
|
||||
,p.name as publisher
|
||||
,p.tld as tld
|
||||
,sent.class_id as sentiment
|
||||
,d.days_away as election_distance
|
||||
,b.ordinal as bias
|
||||
,pca.first as link_1
|
||||
,pca.second as link_2
|
||||
,e.emotion_id as emotion
|
||||
FROM top.stories s
|
||||
JOIN top.publishers p
|
||||
ON p.id = s.publisher_id
|
||||
JOIN top.story_sentiments sent
|
||||
ON s.id = sent.story_id
|
||||
JOIN election_distance d
|
||||
ON d.election_date = s.published_at
|
||||
JOIN publisher_bias pb
|
||||
ON pb.publisher_id = p.id
|
||||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
JOIN top.publisher_pca_onehot pca
|
||||
ON pca.publisher_id = p.id
|
||||
JOIN story_emotions e
|
||||
ON e.story_id = s.id
|
||||
""")
|
||||
DB.close()
|
||||
|
|
|
@ -379,24 +379,34 @@ def debug():
|
|||
|
||||
def another():
|
||||
DB = connect()
|
||||
|
||||
DB.sql("""
|
||||
select
|
||||
*
|
||||
from emotions
|
||||
""")
|
||||
|
||||
emotions = DB.sql("""
|
||||
DB.sql("""
|
||||
select
|
||||
year(s.published_at) as year
|
||||
,se.label as emotion
|
||||
,count(1) as stories
|
||||
from stories s
|
||||
join story_emotions se
|
||||
on s.id = se.story_id
|
||||
group by
|
||||
year(s.published_at)
|
||||
,se.label
|
||||
*
|
||||
from story_emotions
|
||||
""")
|
||||
|
||||
emotions = DB.sql("""
|
||||
SELECT
|
||||
YEAR(s.published_at) AS year
|
||||
,e.label AS emotion
|
||||
,count(1) AS stories
|
||||
FROM stories s
|
||||
JOIN story_emotions se
|
||||
ON s.id = se.story_id
|
||||
JOIN emotions e
|
||||
ON e.id = se.emotion_id
|
||||
GROUP by
|
||||
YEAR(s.published_at)
|
||||
,e.label
|
||||
""").df()
|
||||
emotions
|
||||
|
||||
sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
|
||||
plt.show()
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
import plots.sentence
|
||||
import plots.emotion
|
||||
import plots.sentiment
|
||||
|
||||
__all__ = [
|
||||
'sentence',
|
||||
'emotion',
|
||||
'sentiment',
|
||||
]
|
|
@ -0,0 +1,117 @@
|
|||
import click
|
||||
from data.main import connect
|
||||
import os
|
||||
from pathlib import Path
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
|
||||
|
||||
@click.command('plot:emotion-over-time')
|
||||
def emotion_over_time():
|
||||
filename = "emotion_over_time.png"
|
||||
DB = connect()
|
||||
|
||||
emotions = DB.sql("""
|
||||
SELECT
|
||||
date_trunc('year', s.published_at) AS year
|
||||
,e.label AS emotion
|
||||
,count(1) AS stories
|
||||
FROM top.stories s
|
||||
JOIN story_emotions se
|
||||
ON s.id = se.story_id
|
||||
JOIN emotions e
|
||||
ON e.id = se.emotion_id
|
||||
GROUP by
|
||||
date_trunc('year', s.published_at)
|
||||
,e.label
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
ax = sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
|
||||
ax.set(title="title emotions over years", xlabel="year", ylabel="stories (#)")
|
||||
plt.savefig(out_path / filename)
|
||||
print(f"saved: {filename}")
|
||||
|
||||
@click.command('plot:emotion-regression')
|
||||
def emotion_regression():
|
||||
from sklearn import linear_model
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import ConfusionMatrixDisplay
|
||||
|
||||
filename = "emotion_regression.png"
|
||||
|
||||
|
||||
DB = connect()
|
||||
emotions = DB.query("""
|
||||
SELECT
|
||||
label
|
||||
FROM emotions e
|
||||
""").df()['label'].to_list()
|
||||
DB.close()
|
||||
|
||||
DB = connect()
|
||||
df = DB.sql(f"""
|
||||
SELECT
|
||||
epoch(date_trunc('yearweek', s.published_at)) AS date
|
||||
,e.id AS emotion_id
|
||||
,p.id as publisher_id
|
||||
,count(1) AS stories
|
||||
FROM top.stories s
|
||||
JOIN top.publishers p
|
||||
ON p.id = s.publisher_id
|
||||
JOIN story_emotions se
|
||||
ON s.id = se.story_id
|
||||
JOIN emotions e
|
||||
ON e.id = se.emotion_id
|
||||
GROUP by
|
||||
epoch(date_trunc('yearweek', s.published_at))
|
||||
,p.id
|
||||
,e.id
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
results = []
|
||||
for (emotion_id, publisher_id), group in df.groupby(['emotion_id', 'publisher_id']):
|
||||
model = linear_model.LinearRegression()
|
||||
x = group['date'].to_numpy().reshape(-1, 1)
|
||||
y = group['stories'].to_numpy()
|
||||
model.fit(x, y)
|
||||
per_year = model.coef_.item() * 60 * 60 * 24 * 365
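# (x is epoch seconds, so the factor above rescales the per-second slope to stories per year)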
|
||||
results.append({'emotion_id' : emotion_id, 'publisher_id':publisher_id, 'per_year' : per_year})
|
||||
results = pd.DataFrame(results)
|
||||
|
||||
DB = connect()
|
||||
out = DB.query("""
|
||||
SELECT
|
||||
e.label as emotion
|
||||
--,p.tld
|
||||
,avg(results.per_year) as avg_reg_coef
|
||||
,b.ordinal
|
||||
FROM results
|
||||
JOIN emotions e
|
||||
ON e.id = results.emotion_id
|
||||
JOIN top.publishers p
|
||||
ON p.id = results.publisher_id
|
||||
JOIN publisher_bias pb
|
||||
ON pb.publisher_id = results.publisher_id
|
||||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
GROUP BY
|
||||
e.label
|
||||
,b.ordinal
|
||||
""").df()
|
||||
DB.close()
|
||||
pivot = out.pivot(index=['emotion'], columns=['ordinal'], values=['avg_reg_coef'])
|
||||
|
||||
ax = sns.heatmap(pivot, cmap='RdBu_r')
|
||||
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
|
||||
ax.set(title="slope of regression (stories/year) by bias and emotion"
|
||||
,xticklabels=ticklabels
|
||||
,xlabel="bias"
|
||||
,ylabel="emotion")
|
||||
plt.tight_layout()
|
||||
plt.savefig(out_path / filename)
|
||||
print(f"saved: {filename}")
|
|
@ -0,0 +1,111 @@
|
|||
import click
|
||||
from data.main import connect
|
||||
import os
|
||||
from pathlib import Path
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
|
||||
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
|
||||
|
||||
@click.command('plot:sentence-pca')
|
||||
def sentence_pca():
|
||||
filename = "embedding_sentence_pca.png"
|
||||
DB = connect()
|
||||
|
||||
data = DB.query("""
|
||||
SELECT
|
||||
pca.first
|
||||
,pca.second
|
||||
,b.bias as label
|
||||
FROM top.story_embeddings_pca pca
|
||||
JOIN top.stories s
|
||||
ON s.id = pca.story_id
|
||||
JOIN top.publisher_bias pb
|
||||
ON pb.publisher_id = s.publisher_id
|
||||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
|
||||
ax.set(title="pca components vs. bias label", xlabel="first component", ylabel="second component")
|
||||
plt.savefig(out_path / filename)
|
||||
|
||||
@click.command('plot:avg-sentence-pca')
|
||||
def avg_sentence_pca():
|
||||
filename = "avg_embedding_sentence_pca.png"
|
||||
DB = connect()
|
||||
|
||||
data = DB.query("""
|
||||
SELECT
|
||||
pca.first
|
||||
,pca.second
|
||||
,p.tld
|
||||
,b.bias as label
|
||||
FROM top.publisher_embeddings_pca pca
|
||||
JOIN top.publishers p
|
||||
ON p.id = pca.publisher_id
|
||||
JOIN top.publisher_bias pb
|
||||
ON pb.publisher_id = p.id
|
||||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
ax = sns.scatterplot(x=data['first'], y=data['second'], hue=data['label'])
|
||||
ax.set(title="avg. publisher embedding pca components vs. bias label", xlabel="first component", ylabel="second component")
|
||||
plt.savefig(out_path / filename)
|
||||
|
||||
@click.command('plot:sentence-confusion')
|
||||
def sentence_confusion():
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.metrics import ConfusionMatrixDisplay
|
||||
|
||||
filename = "sentence_confusion.png"
|
||||
|
||||
embeddings = np.load(data_path / 'embeddings.npy')
|
||||
embedding_ids = np.load(data_path / 'embedding_ids.npy')
|
||||
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
|
||||
|
||||
DB = connect()
|
||||
data = DB.query("""
|
||||
SELECT
|
||||
ids.index
|
||||
,s.id
|
||||
,b.ordinal
|
||||
FROM ids
|
||||
JOIN top.stories s
|
||||
ON ids.story_id = s.id
|
||||
JOIN top.publisher_bias pb
|
||||
ON pb.publisher_id = s.publisher_id
|
||||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
""").df()
|
||||
pub = DB.query("""
|
||||
SELECT
|
||||
*
|
||||
FROM top.publishers
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
train, test = train_test_split(data)
|
||||
train_x, train_y = embeddings[train['index']], train['ordinal']
|
||||
test_x, test_y = embeddings[test['index']], test['ordinal']
|
||||
|
||||
model = KNeighborsClassifier(n_neighbors=5)
|
||||
model.fit(train_x, train_y)
|
||||
pred = model.predict(test_x)
|
||||
|
||||
|
||||
fig, ax = plt.subplots(figsize=(10, 5))
|
||||
ConfusionMatrixDisplay.from_predictions(test_y, pred, ax=ax)
|
||||
ticklabels = ['left', 'left-center', 'center', 'right-center', 'right']
|
||||
ax.set(title="confusion matrix for kNN classifier on test data.", xticklabels=ticklabels, yticklabels=ticklabels)
|
||||
plt.savefig(out_path / filename)
|
||||
plt.close()
|
||||
|
||||
print(f"saved plot: {filename}")
|
|
@ -0,0 +1,60 @@
|
|||
import click
|
||||
from data.main import connect
|
||||
import os
|
||||
from pathlib import Path
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
out_path = Path(os.getenv('DATA_MINING_DOC_DIR')) / 'figures'
|
||||
@click.command('plot:sentiment-over-time')
|
||||
def over_time():
|
||||
filename = "sentiment_over_time.png"
|
||||
|
||||
DB = connect()
|
||||
data = DB.sql("""
|
||||
SELECT
|
||||
avg(sent.class_id) as sentiment
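-- class_id is 0/1 (negative/positive), so this average is the share of positive titles per published_at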
|
||||
,s.published_at as date
|
||||
FROM top.story_sentiments sent
|
||||
JOIN top.stories s
|
||||
ON s.id = sent.story_id
|
||||
GROUP BY
|
||||
s.published_at
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
ax = sns.scatterplot(x=data['date'], y=data['sentiment'])
|
||||
ax.set(title="sentiment vs. time")
|
||||
plt.tight_layout()
|
||||
plt.savefig(out_path / filename)
|
||||
print(f"saved: {filename}")
|
||||
@click.command('plot:bias-vs-sentiment-over-time')
|
||||
def bias_over_time():
|
||||
filename = "bias_vs_sentiment_over_time.png"
|
||||
|
||||
DB = connect()
|
||||
data = DB.sql("""
|
||||
SELECT
|
||||
avg(sent.class_id) as sentiment
|
||||
,s.published_at as date
|
||||
,b.id as bias_id
|
||||
FROM top.story_sentiments sent
|
||||
JOIN top.stories s
|
||||
ON s.id = sent.story_id
|
||||
JOIN publisher_bias pb
|
||||
ON pb.publisher_id = s.publisher_id
|
||||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
GROUP BY
|
||||
s.published_at
|
||||
,b.id
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
ax = sns.relplot(x=data['date'], y=data['sentiment'], col=data['bias_id'])
|
||||
ax.set(title="sentiment vs. time grouped by bias")
|
||||
plt.tight_layout()
|
||||
plt.savefig(out_path / filename)
|
||||
print(f"saved: {filename}")
|
|
@ -72,16 +72,71 @@ def embed(chunks):
|
|||
print(f"ids saved: {save_to}")
|
||||
|
||||
|
||||
@click.command('sentence:create-pca-table')
|
||||
def create_table():
|
||||
from sklearn import linear_model
|
||||
@click.command('sentence:create-avg-pca-table')
|
||||
def create_avg_pca_table():
|
||||
from sklearn.decomposition import PCA
|
||||
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
|
||||
|
||||
|
||||
embeddings = np.load(data_path / 'embeddings.npy')
|
||||
embedding_ids = np.load(data_path / 'embedding_ids.npy')
|
||||
ids = pd.DataFrame(embedding_ids, columns=['story_id']).reset_index()
|
||||
DB = connect()
|
||||
|
||||
DB = connect()
|
||||
data = DB.query("""
|
||||
SELECT
|
||||
ids.index
|
||||
,s.id
|
||||
,s.publisher_id
|
||||
,b.ordinal
|
||||
FROM ids
|
||||
JOIN top.stories s
|
||||
ON ids.story_id = s.id
|
||||
JOIN top.publisher_bias pb
|
||||
ON pb.publisher_id = s.publisher_id
|
||||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
results = []
|
||||
for publisher_id, group in data.groupby(['publisher_id']):
|
||||
avg = embeddings[group['index']].mean(axis=0)
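# (averages all of this publisher's title embeddings into one 384-dim vector)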
|
||||
ordinal = group['ordinal'].iloc[0]
|
||||
results.append({'publisher_id' : publisher_id, 'embedding' : avg, 'ordinal' : ordinal})
|
||||
results = pd.DataFrame(results)
|
||||
|
||||
x = np.stack(results['embedding'])
|
||||
y = results['ordinal']
|
||||
|
||||
model = PCA(n_components=2)
|
||||
pred = model.fit_transform(x)
|
||||
results['first'] = pred[:, 0]
|
||||
results['second'] = pred[:, 1]
|
||||
|
||||
table_name = "top.publisher_embeddings_pca"
|
||||
DB = connect()
|
||||
DB.query(f"""
|
||||
CREATE OR REPLACE TABLE {table_name} AS
|
||||
SELECT
|
||||
results.publisher_id as publisher_id
|
||||
,results.first as first
|
||||
,results.second as second
|
||||
FROM results
|
||||
""")
|
||||
DB.close()
|
||||
print(f"created {table_name}")
|
||||
|
||||
|
||||
@click.command('sentence:create-pca-table')
|
||||
def create_pca_table():
|
||||
from sklearn.decomposition import PCA
|
||||
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
|
||||
|
||||
embeddings = np.load(data_path / 'embeddings.npy')
|
||||
embedding_ids = np.load(data_path / 'embedding_ids.npy')
|
||||
|
||||
DB = connect()
|
||||
data = DB.query("""
|
||||
SELECT
|
||||
ids.index
|
||||
|
@ -95,19 +150,38 @@ def create_table():
|
|||
JOIN bias_ratings b
|
||||
ON b.id = pb.bias_id
|
||||
""").df()
|
||||
pub = DB.query("""
|
||||
SELECT
|
||||
*
|
||||
FROM top.publishers
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
x = embeddings[data['index']]
|
||||
y = data['ordinal'].to_numpy().reshape(-1, 1)
|
||||
model = PCA(n_components=2)
|
||||
pred = model.fit_transform(x)
|
||||
data['first'] = pred[:, 0]
|
||||
data['second'] = pred[:, 1]
|
||||
|
||||
reg = linear_model.LinearRegression()
|
||||
table_name = f"top.story_embeddings_pca"
|
||||
|
||||
reg.fit(x, y)
|
||||
|
||||
reg.coef_.shape
|
||||
DB = connect()
|
||||
DB.query(f"""
|
||||
CREATE OR REPLACE TABLE {table_name} AS
|
||||
SELECT
|
||||
data.id as story_id
|
||||
,data.first as first
|
||||
,data.second as second
|
||||
FROM data
|
||||
""")
|
||||
DB.close()
|
||||
print(f"created {table_name}")
|
||||
|
||||
@click.command('sentence:create-svm-table')
|
||||
def create_svm_table():
|
||||
from sklearn import svm
|
||||
from sklearn.linear_model import SGDClassifier
|
||||
data_path = Path(os.getenv('DATA_MINING_DATA_DIR'))
|
||||
|
||||
embeddings = np.load(data_path / 'embeddings.npy')
|
||||
|
@ -133,6 +207,8 @@ def create_svm_table():
|
|||
#y = data['ordinal'].to_numpy().reshape(-1, 1)
|
||||
y = data['ordinal']
|
||||
|
||||
clf = svm.SVC()
|
||||
pred = clf.fit(x, y)
|
||||
model = SGDClassifier()
|
||||
pred = model.fit(x, y)
|
||||
data['pred'] = pred.predict(x)
|
||||
data
|
||||
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from data import connect, data_dir
|
||||
import numpy as np
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
import click
|
||||
|
||||
@click.option('-c', '--chunks', type=int, default=500, show_default=True)
|
||||
@click.command("sentiment:extract")
|
||||
def extract(chunks):
|
||||
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||
|
||||
|
||||
# Load model from HuggingFace Hub
|
||||
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
||||
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
|
||||
model = model.to(device)
|
||||
|
||||
|
||||
# load data
|
||||
DB = connect()
|
||||
table = DB.sql("""
|
||||
select
|
||||
id
|
||||
,title
|
||||
from stories
|
||||
order by id desc
|
||||
""").df()
|
||||
DB.close()
|
||||
|
||||
# normalize text
|
||||
table['title'] = table['title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
|
||||
|
||||
|
||||
chunked = np.array_split(table, chunks)
|
||||
|
||||
# generate embeddings from list of titles
|
||||
iterator = tqdm(chunked, 'embedding')
|
||||
sentiments = []
|
||||
story_ids = []
|
||||
for _, chunk in enumerate(iterator):
|
||||
sentences = chunk['title'].tolist()
|
||||
ids = chunk['id'].tolist()
|
||||
# Tokenize sentences
|
||||
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
||||
# Compute token embeddings
|
||||
with torch.no_grad():
|
||||
logits = model(**encoded_input.to(device)).logits
|
||||
sentiment = logits.argmax(axis=1).tolist()
|
||||
sentiments.append(sentiment)
|
||||
story_ids.append(ids)
|
||||
|
||||
sentiments = np.concatenate(sentiments)
|
||||
story_ids = np.concatenate(story_ids)
|
||||
|
||||
# save embeddings
|
||||
save_to = data_dir() / 'sentiment.npy'
|
||||
np.save(save_to, sentiments)
|
||||
print(f"sentiments saved: {save_to}")
|
||||
|
||||
# save ids
|
||||
save_to = data_dir() / 'sentiment_ids.npy'
|
||||
np.save(save_to, story_ids)
|
||||
print(f"ids saved: {save_to}")
|
||||
|
||||
@click.command('sentiment:load')
|
||||
def load():
|
||||
|
||||
DB = connect()
|
||||
sentiments = np.load(data_dir() / 'sentiment.npy')
|
||||
story_ids = np.load(data_dir() / 'sentiment_ids.npy')
|
||||
data = pd.DataFrame(story_ids, columns=['story_id']).reset_index()
|
||||
data['sentiment_id'] = sentiments
|
||||
|
||||
DB.query("""
|
||||
CREATE OR REPLACE TABLE top.story_sentiments AS
|
||||
SELECT
|
||||
data.story_id
|
||||
,data.sentiment_id as class_id
|
||||
,CASE WHEN data.sentiment_id = 1 THEN 'positive' ELSE 'negative' end as label
|
||||
FROM data
|
||||
JOIN top.stories s
|
||||
ON s.id = data.story_id
|
||||
""")
|
||||
DB.close()