add better normalization. add link similarity.
parent 3a6f97b290
commit 4bd9f46edd
@@ -1,5 +1,29 @@
# Data Mining - CSCI 577

# Project Status Report IV

*2023-04-25*

This project report will take the form of an initial draft of the final report, making use of the template discussed in class and made available on Canvas. Minimally, this draft should include the following:

1. Data preparation
2. Policy for dealing with missing attribute values
3. If your project is one of classification, discuss:
    a. Intelligent discretization
    b. Identification of useless attributes
    c. Policy for violations of the adequacy condition and missing attribute values
4. If your project is one of clustering:
    a. Elimination of noise attributes
    b. Proper choice or development of distance measures
5. If your project is one of association rule analysis:
    a. What are the "market baskets"?
    b. How are thresholds for support and confidence developed?
6. In all cases, you should specify:
    a. What computational experiments you have conducted, or plan to conduct.

# Project Status Report III

*2023-04-18*

@@ -35,6 +59,10 @@ I will use the following suite of python tools to conduct my research:

> This progress report should also provide a definitive description of your purpose and how you intend to conduct it.
> This should take the form of a detailed outline of the procedures you will undertake in exploring your dataset(s) and maximizing the knowledge that can be extracted from it.

The ultimate purpose of the project is to track the progress of political discourse as a function of time and publisher.
Using a dataset of article titles and publications, the aim is to classify article titles with a sentiment-analysis language model.
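
The classification step itself is not part of this commit; as a rough illustration only, scoring titles with a pretrained model could look like the sketch below (the Hugging Face pipeline and checkpoint name here are placeholders, not the model this project actually uses):

```python
# illustrative sketch: score article titles with a pretrained classifier
from transformers import pipeline

classify = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",  # placeholder checkpoint
)

titles = [
    "Senate passes sweeping budget bill after all-night session",
    "Lawmakers trade blame as shutdown looms",
]
for title, result in zip(titles, classify(titles)):
    print(f"{result['label']:>8}  {result['score']:.2f}  {title}")
```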

\newpage

# Project Status Report II

40  src/bias.py

@@ -7,14 +7,16 @@ import os
import csv

def map(rating:str) -> int:
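    # map a bias rating label onto an ordinal left-to-right scale
    # (left = 0 … right = 4; 'allsides' has no single placement and maps to -1)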

    mapping = {
-       'right' : 0,
+       'left' : 0,
        'left-center' : 1,
        'center' : 2,
-       'left' : 3,
-       'allsides' : 4,
-       'right-center' : 5
+       'right-center' : 3,
+       'right' : 4,
+       'allsides' : -1,
    }

    return mapping[rating]

@@ -35,13 +37,39 @@ def load() -> None:

def normalize() -> None:
    DB = connect()
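    # fuzzy-match publisher names to bias ratings: join on Jaro-Winkler
    # similarity of the lowercased names (> 0.95), then keep only the best
    # match per publisher via ROW_NUMBER() ordered by similarity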

    DB.sql("""
        CREATE OR REPLACE TABLE publisher_bias AS
        WITH cte AS (
            SELECT
                p.id
                ,b.bias as label
                ,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
            FROM bias_ratings b
            JOIN publishers p
                ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
        ), ranked AS (
            SELECT
                id
                ,label
                ,similarity
                ,ROW_NUMBER() OVER(PARTITION BY id ORDER BY similarity DESC) AS rn
            FROM cte
        )
        SELECT
            id
            ,label
        FROM ranked
        WHERE ranked.rn = 1
    """)

    DB.sql("""
        with cte as (
            select
-               s.publisher
+               s.publisher_id
                ,count(1) as stories
            from stories s
-           group by s.publisher
+           group by s.publisher_id
        )
        select
            s.publisher

@@ -1,10 +1,12 @@
import click
from dotenv import load_dotenv

@click.group()
def cli():
    ...

if __name__ == "__main__":
    load_dotenv()
    import scrape
    cli.add_command(scrape.download)
    cli.add_command(scrape.parse)

155  src/emotion.py

@@ -9,6 +9,8 @@ from model import BertForMultiLabelClassification
from data import connect
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

def data():
    # load data

@@ -126,24 +128,153 @@ def normalize():
    """)
    DB.close()

@click.command("emotion:analyze")
def coef_over_time():
    """plot and group emotional labels"""
    DB = connect()

    emotions = DB.sql("""
        select label from emotions
    """).df()

    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    def results(buckets = '1 month'):
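        # share of stories per emotion label in each time bucket,
        # normalized by the total number of stories in that bucket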
        results = DB.sql(f"""
            with cte as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
            )
            select
                epoch(cte.date) / 60 / 60 / 24 / 365 as date
                ,cte.label
                ,cast(cte.stories as float) / t.stories as stories
            from cte
            join total t
                on t.date = cte.date
        """).df()
        return results

    def get_coef(label):
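        # slope of a simple linear fit of the label's share of stories
        # against time (epoch seconds converted to years above)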
        reg = linear_model.LinearRegression()
        df = results[results['label'] == label]
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']
        # split x and y together so the train/test rows stay paired
        x_train, x_test, y_train, y_test = train_test_split(x, y)
        reg.fit(x_train, y_train)
        # y_pred = reg.predict(x_test)
        # sns.lineplot(x=x_test.flatten(), y=y_pred)
        return reg.coef_

    collection = []
    results = results('2 year')
    for emotion in emotions['label']:
        if emotion == 'neutral':
            continue
        coef = get_coef(emotion)[0]
        if coef > 0:
            increasing = True
        else:
            increasing = False
        collection.append({'emotion' : emotion, 'coef' : coef, 'increasing' : coef > 0 })
    pd.DataFrame(collection).sort_values('coef')

    plt.show()

@click.command("emotion:analyze")
def analyze():
    """plot and group emotional labels"""
    DB = connect()

    emotions = DB.sql("""
        select label from emotions
    """).df()

    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    def get_coef(emotion):
        df = DB.sql("""
            with cte as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                    ON s.id = se.story_id
                JOIN emotions e
                    ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                --AND e.label in ('neutral', 'annoyance')
                AND e.label in ('sadness')
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
            )
            select
                epoch(cte.date) as date
                ,cte.label
                --,total.stories as total
                ,cast(cte.stories as float) / e.stories as stories
            from cte
            join emotions e
                --on total.date = cte.date
                on e.label = cte.label
        """).df()

        reg = linear_model.LinearRegression()
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']

        # split x and y together so the train/test rows stay paired
        x_train, x_test, y_train, y_test = train_test_split(x, y)
        reg.fit(x_train, y_train)
        #y_pred = reg.predict(x_test)
        return reg.coef_


    df = DB.sql(f"""{yearly}""").df()
    df['date'] = pd.to_datetime(df['date'])
    ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
    #ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
    plt.locator_params(axis='y', nbins=6)
    ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
    plt.show()

    DB.sql("""
        WITH grouped as (
            SELECT
                YEAR(s.published_at) as year
                ,e.label
                ,COUNT(1) AS stories
            FROM story_emotions e
            JOIN stories s
                ON s.id = e.story_id
            WHERE YEAR(s.published_at) < 2022
            AND label = 'annoyance'
            GROUP BY
                YEAR(s.published_at)
                ,e.label
        ), total AS (
            SELECT
                e.label

@@ -0,0 +1,111 @@
from data import connect
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
import seaborn as sns
import matplotlib.pyplot as plt


def to_matrix():
    """returns an adjacency matrix of publisher-to-publisher link frequency"""

    DB = connect()

    bias_map = pd.DataFrame([
        {'label' : 'left',         'value' : 0},
        {'label' : 'left-center',  'value' : 1},
        {'label' : 'center',       'value' : 2},
        {'label' : 'right-center', 'value' : 3},
        {'label' : 'right',        'value' : 4},
        {'label' : 'allsides',     'value' : -1},
    ])
    bias = DB.sql("""
        SELECT
            b.id
            ,b.label
            ,m.value
        FROM publisher_bias b
        JOIN bias_map m
            ON b.label = m.label
        WHERE value != -1
    """).df()

    pub = DB.sql("""
        select
            p.id
            ,p.name
            ,p.url
            ,b.label
            ,b.value
        from publishers p
        left join bias b
            on b.id = p.id
    """).df()

    edges = DB.sql("""
        WITH total as (
            SELECT
                s.publisher_id as id
                ,COUNT(1) as stories
            FROM stories s
            GROUP BY
                s.publisher_id
        ), p as (
            SELECT
                p.id
                ,stories
            FROM publishers p
            LEFT JOIN total t
                ON t.id = p.id
            WHERE t.stories >= 20
        ), cte as (
            SELECT
                r.publisher_id as child_id
                ,s.publisher_id as parent_id
                ,count(1) as links
            FROM related_stories r
            JOIN stories s
                ON s.id = r.parent_id
            group by
                s.publisher_id
                ,r.publisher_id
        )
        SELECT
            p.id as parent_id
            ,cte.child_id
            ,links
        FROM p
        left JOIN cte
            ON p.id = cte.parent_id
    """).df()

    adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
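    # rows: linking (parent) publisher, columns: linked-to (child) publisher,
    # values: number of links observed between the pair (0 where none)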

    out = pd.DataFrame(adj.index.values, columns=['id'])
    out = pd.merge(out, pub, how='left', on='id')

    pca = PCA(n_components=4)
    pca_out = pca.fit_transform(adj)

    svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
    svd_out = svd.fit_transform(adj)

    x = svd_out[:, 0]
    y = svd_out[:, 1]

    x = pca_out[:, 0]
    y = pca_out[:, 1]
    sns.scatterplot(x=x, y=y)
    plt.show()

    kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
    pred = kmeans.fit_predict(pca_out)

    sns.scatterplot(x=x, y=y, hue=pred)
    plt.show()

    # color the same 2-D projection by each publisher's bias label
    sns.scatterplot(x=x, y=y, hue=out['label'])
    plt.show()

@@ -7,6 +7,7 @@ from tqdm import tqdm
from data import data_dir, connect
from lxml import etree
import pandas as pd
from urllib.parse import urlparse

@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)

@@ -103,12 +104,14 @@ def parse(directory, output_dir):

    url = item.xpath('.//strong/a')[0].get('href')
    out['url'] = url
    out['publisher_url_domain'] = urlparse(publisher_url).netloc
    out['domain'] = urlparse(url).netloc

    item_id = hash((page.stem, url))
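    # story id: hash of the page's filename stem together with the story url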
    out['id'] = item_id

-   old_id = hash((title, page.stem, publisher_url))
-   out['old_id'] = old_id
+   # old_id = hash((title, page.stem, publisher_url))
+   # out['old_id'] = old_id
    published.append(out)

    related = item.xpath(".//span[contains(@class, 'mls')]/a")

@@ -118,6 +121,7 @@ def parse(directory, output_dir):
            another['url'] = relation.get('href')
            another['publisher'] = relation.text
            another['parent_id'] = item_id
            another['publisher_domain'] = urlparse(another['url']).netloc
            others.append(another)
    df = pd.DataFrame(published)
    df.to_csv(output_dir / 'stories.csv', sep='|', index=False)

@@ -128,6 +132,7 @@ def parse(directory, output_dir):
def normalize():
    """fix database after load. remove duplicates. create publishers."""
    DB = connect()

    DB.sql("""
        DELETE FROM stories
        WHERE id IN (

@@ -146,29 +151,77 @@ def normalize():
            OR title_ctn > 1
        )
    """)

    DB.sql("""
        CREATE OR REPLACE TABLE publishers AS
        with cte as (
            SELECT
-               s.publisher
-               ,s.publisher_url
+               s.publisher as name
+               ,s.publisher_url_domain as url
            FROM stories s
            GROUP BY
                s.publisher
-               ,s.publisher_url
+               ,s.publisher_url_domain
        ), together AS (
            SELECT
-               COALESCE(cte.publisher, r.publisher) AS publisher
-               ,cte.publisher_url
+               COALESCE(cte.name, r.publisher) AS name
+               ,COALESCE(cte.url, r.publisher_domain) as url
            FROM cte
            FULL OUTER JOIN related_stories r
-               ON cte.publisher = r.publisher
+               ON cte.url = r.publisher_domain
        )
        SELECT
            ROW_NUMBER() OVER() as id
-           ,t.*
+           ,t.name
+           ,t.url
        FROM together t
+       where t.url is not null
        GROUP BY
-           publisher
-           ,publisher_url
+           name
+           ,url
    """)

    DB.sql("""
        alter table stories
        add column publisher_id bigint
    """)

    DB.sql("""
        update stories
        set publisher_id = publishers.id
        from publishers
        where publishers.url = stories.publisher_url_domain
    """)

    DB.sql("""
        alter table stories alter publisher_id set data type bigint
    """)

    DB.sql("""
        alter table stories drop publisher;
        alter table stories drop publisher_url;
        alter table stories drop publisher_url_domain;
        alter table stories drop domain;
    """)

    DB.sql("""
        alter table related_stories
        add column publisher_id bigint
    """)

    DB.sql("""
        update related_stories
        set publisher_id = publishers.id
        from publishers
        where publishers.url = related_stories.publisher_domain
    """)

    DB.sql("""
        alter table related_stories drop publisher;
        alter table related_stories drop publisher_domain;
    """)

@@ -81,3 +81,4 @@ def distance():
    min_index = (np.argmin(distances))
    closest = np.unravel_index(min_index, distances.shape)
    distances.flatten().shape
    DB.close()