add better normalization. add link similarity.

This commit is contained in:
matt 2023-05-07 22:07:26 -07:00
parent 3a6f97b290
commit 4bd9f46edd
7 changed files with 383 additions and 29 deletions

View File

@ -1,5 +1,29 @@
# Data Mining - CSCI 577
# Project Status Report IV
*2023-04-25*
This project report will take the form of an initial draft of the final report, making use of the template discussed in class and made available on Canvas. Minimally, this draft should include the following:
1. Data preparation
2. Policy for dealing with missing attribute values
3. If your project is one of classification, discuss:
a. Intelligent discretization
b. Identification of useless attributes
c. Policy for violations of the adequacy condition and missing
attribute values
4. If your project is one of clustering:
a. Elimination of noise attributes
b. Proper choice or development of distance measures
5. If your project is one of association rule analysis:
a. What are the "market baskets"?
b. How are thresholds for support and confidence developed?
6. In all cases, you should specify:
a. What computational experiments you have conducted, or plan to
conduct.
# Project Status Report III
*2023-04-18*
@ -35,6 +59,10 @@ I will use the following suite of python tools to conduct my research:
> This progress should also provide a definitive description of your purpose and how you intend to conduct it.
> This should take the form of a detailed outline of the procedures you will undertake in exploring your dataset(s) and maximizing the knowledge that can be extracted from it.
The ultimate purpose of the project is to track how political discourse changes over time and across publishers.
Using a dataset of article titles and their publishers, the project aims to classify each title with a sentiment analysis language model.
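As a rough sketch of this classification step, the snippet below runs a couple of example titles through an off-the-shelf Hugging Face sentiment pipeline; the model name, example titles, and output format are illustrative assumptions, not the project's final configuration.

```python
# hypothetical illustration only: the project's actual model and label set may differ
from transformers import pipeline

classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

titles = [
    "Senate passes sweeping infrastructure bill",
    "Markets slide as inflation fears mount",
]

# classify each title and print label, confidence, and the title itself
for title, result in zip(titles, classifier(titles)):
    print(f"{result['label']:>8}  {result['score']:.2f}  {title}")
```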
\newpage
# Project Status Report II

View File

@ -7,14 +7,16 @@ import os
import csv
def map(rating:str) -> int:
# map a bias label to an ordinal left-to-right scale; 'allsides' is treated as unrated (-1)
mapping = {
'left' : 0,
'left-center' : 1,
'center' : 2,
'right-center' : 3,
'right' : 4,
'allsides' : -1,
}
return mapping[rating]
@ -35,13 +37,39 @@ def load() -> None:
def normalize() -> None:
DB = connect()
DB.sql("""
CREATE OR REPLACE TABLE publisher_bias AS
WITH cte AS (
SELECT
p.id
,b.bias as label
,JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) as similarity
FROM bias_ratings b
JOIN publishers p
ON JARO_WINKLER_SIMILARITY(LOWER(p.name), LOWER(b.publisher)) > 0.95
),ranked AS (
SELECT
id
,label
,similarity
,ROW_NUMBER() OVER(PARTITION BY id ORDER BY similarity DESC) AS rn
FROM cte
)
SELECT
id
,label
FROM ranked
WHERE ranked.rn = 1
""")
DB.sql(""" DB.sql("""
with cte as ( with cte as (
select select
s.publisher s.publisher_id
,count(1) as stories ,count(1) as stories
from stories s from stories s
group by s.publisher group by s.publisher_id
) )
select select
s.publisher s.publisher

View File

@ -1,10 +1,12 @@
import click
from dotenv import load_dotenv
@click.group()
def cli():
...
if __name__ == "__main__":
load_dotenv()
import scrape
cli.add_command(scrape.download)
cli.add_command(scrape.parse)

View File

@ -9,6 +9,8 @@ from model import BertForMultiLabelClassification
from data import connect
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
def data():
# load data
@ -126,24 +128,153 @@ def normalize():
""") """)
DB.close() DB.close()
@click.command("emotion:analyze")
def coef_over_time():
"""plot and group emotional labels"""
DB = connect()
emotions = DB.sql("""
select label from emotions
""").df()
from sklearn import linear_model
from sklearn.model_selection import train_test_split
def results(buckets = '1 month'):
results = DB.sql(f"""
with cte as (
SELECT
time_bucket(interval '{buckets}', s.published_at) as date
,e.label
,COUNT(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '{buckets}', s.published_at)
,e.label
)
,total as (
SELECT
time_bucket(interval '{buckets}', s.published_at) as date
,COUNT(1) AS stories
FROM stories s
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '{buckets}', s.published_at)
)
select
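-- convert epoch seconds to years so the fitted slope reads as change per year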
epoch(cte.date) / 60 / 60 / 24 / 365 as date
,cte.label
,cast(cte.stories as float) / t.stories as stories
from cte
join total t
on t.date = cte.date
""").df()
return results
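# fit a simple linear regression to one emotion's share of stories over time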
def get_coef(label):
reg = linear_model.LinearRegression()
df = results[results['label'] == label]
x = df['date'].to_numpy().reshape(-1, 1)
y = df['stories']
x_train, x_test, y_train, y_test = train_test_split(x, y)
reg.fit(x_train, y_train)
# y_pred = reg.predict(x_test)
# sns.lineplot(x=x_test.flatten(), y=y_pred)
return reg.coef_
collection = []
results = results('2 year')
for emotion in emotions['label']:
if emotion == 'neutral':
continue
coef = get_coef(emotion)[0]
if coef > 0:
increasing = True
else:
increasing = False
collection.append({'emotion' : emotion, 'coef' : coef, 'increasing' : coef > 0 })
print(pd.DataFrame(collection).sort_values('coef'))
plt.show()
@click.command("emotion:analyze") @click.command("emotion:analyze")
def analyze(): def analyze():
"""plot and group emotional labels""" """plot and group emotional labels"""
DB = connect() DB = connect()
emotions = DB.sql("""
select label from emotions
""").df()
from sklearn import linear_model
from sklearn.model_selection import train_test_split
def get_coef(emotion):
df = DB.sql("""
with cte as (
SELECT
time_bucket(interval '1 month', s.published_at) as date
,e.label
,COUNT(1) AS stories
FROM stories s
JOIN story_emotions se
ON s.id = se.story_id
JOIN emotions e
ON e.id = se.emotion_id
WHERE YEAR(s.published_at) < 2022
--AND e.label in ('neutral', 'annoyance')
AND e.label in ('sadness')
GROUP BY
time_bucket(interval '1 month', s.published_at)
,e.label
)
,total as (
SELECT
time_bucket(interval '1 month', s.published_at) as date
,COUNT(1) AS stories
FROM stories s
WHERE YEAR(s.published_at) < 2022
GROUP BY
time_bucket(interval '1 month', s.published_at)
)
select
epoch(cte.date) as date
,cte.label
--,total.stories as total
,cast(cte.stories as float) / t.stories as stories
from cte
join total t
on t.date = cte.date
""").df()
reg = linear_model.LinearRegression()
x = df['date'].to_numpy().reshape(-1, 1)
y = df['stories']
x_train, x_test, y_train, y_test = train_test_split(x, y)
reg.fit(x_train, y_train)
#y_pred = reg.predict(x_test)
return reg.coef_
df = DB.sql(f"""{yearly}""").df()
df['date'] = pd.to_datetime(df['date'])
ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
#ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
plt.locator_params(axis='y', nbins=6)
ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
plt.show()
DB.sql(""" DB.sql("""
WITH grouped as ( WITH grouped as (
SELECT
YEAR(s.published_at) as year
,e.label
,COUNT(1) AS stories
FROM story_emotions e
JOIN stories s
ON s.id = e.story_id
WHERE YEAR(s.published_at) < 2022
AND label = 'annoyance'
GROUP BY
YEAR(s.published_at)
,e.label
), total AS (
SELECT
e.label

111
src/links.py Normal file
View File

@ -0,0 +1,111 @@
from data import connect
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
import seaborn as sns
import matplotlib.pyplot as plt
def to_matrix():
"""build a publisher-to-publisher link-frequency adjacency matrix, then project and cluster it"""
DB = connect()
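# ordinal encoding of bias labels; 'allsides' (value -1) is filtered out below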
bias_map = pd.DataFrame([
{'label' :'left', 'value' : 0},
{'label' :'left-center', 'value' : 1},
{'label' :'center', 'value' : 2},
{'label' :'right-center', 'value' : 3},
{'label' :'right', 'value' : 4},
{'label' :'allsides', 'value' : -1},
])
bias = DB.sql("""
SELECT
b.id
,b.label
,m.value
FROM publisher_bias b
JOIN bias_map m
ON b.label = m.label
WHERE value != -1
""").df()
pub = DB.sql("""
select
p.id
,p.name
,p.url
,b.label
,b.value
from publishers p
left join bias b
on b.id = p.id
""").df()
edges = DB.sql("""
WITH total as (
SELECT
s.publisher_id as id
,COUNT(1) as stories
FROM stories s
GROUP BY
s.publisher_id
), p as (
SELECT
p.id
,stories
FROM publishers p
LEFT JOIN total t
ON t.id = p.id
WHERE t.stories >= 20
), cte as (
SELECT
r.publisher_id as child_id
,s.publisher_id as parent_id
,count(1) as links
FROM related_stories r
JOIN stories s
ON s.id = r.parent_id
group by
s.publisher_id
,r.publisher_id
)
SELECT
p.id as parent_id
,cte.child_id
,links
FROM p
left JOIN cte
ON p.id = cte.parent_id
""").df()
adj = edges.pivot(index='parent_id', columns='child_id', values='links').fillna(0)
out = pd.DataFrame(adj.index.values, columns=['id'])
out = pd.merge(out, pub, how='left', on='id')
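# reduce the adjacency matrix to a few components for visualization (the plots below use the PCA projection)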
pca = PCA(n_components=4)
pca_out = pca.fit_transform(adj)
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd_out = svd.fit_transform(adj)
x = svd_out[:, 0]
y = svd_out[:, 1]
x = pca_out[:, 0]
y = pca_out[:, 1]
sns.scatterplot(x=x, y=y)
plt.show()
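# cluster publishers in the reduced space and color the scatter plot by cluster assignment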
kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, n_init="auto")
pred = kmeans.fit_predict(pca_out)
sns.scatterplot(x=x, y=y, hue=pred)
plt.show()
# plot the first two components colored by bias label (assumes `out` rows align with the adjacency matrix)
sns.scatterplot(x=pca_out[:, 0], y=pca_out[:, 1], hue=out['label'])
plt.show()

View File

@ -7,6 +7,7 @@ from tqdm import tqdm
from data import data_dir, connect
from lxml import etree
import pandas as pd
from urllib.parse import urlparse
@click.command(name='scrape:load')
@click.option('--directory', type=Path, default=data_dir(), show_default=True)
@ -103,12 +104,14 @@ def parse(directory, output_dir):
url = item.xpath('.//strong/a')[0].get('href')
out['url'] = url
out['publisher_url_domain'] = urlparse(publisher_url).netloc
out['domain'] = urlparse(url).netloc
item_id = hash((page.stem, url))
out['id'] = item_id
# old_id = hash((title, page.stem, publisher_url))
# out['old_id'] = old_id
published.append(out)
related = item.xpath(".//span[contains(@class, 'mls')]/a")
@ -118,6 +121,7 @@ def parse(directory, output_dir):
another['url'] = relation.get('href')
another['publisher'] = relation.text
another['parent_id'] = item_id
another['publisher_domain'] = urlparse(another['url']).netloc
others.append(another)
df = pd.DataFrame(published)
df.to_csv(output_dir / 'stories.csv', sep='|', index=False)
@ -128,6 +132,7 @@ def parse(directory, output_dir):
def normalize():
"""fix database after load. remove duplicates. create publishers."""
DB = connect()
DB.sql("""
DELETE FROM stories
WHERE id IN (
@ -146,29 +151,77 @@ def normalize():
OR title_ctn > 1
)
""")
DB.sql("""
CREATE OR REPLACE TABLE publishers AS
with cte as (
SELECT
s.publisher as name
,s.publisher_url_domain as url
FROM stories s
GROUP BY
s.publisher
,s.publisher_url_domain
), together AS (
SELECT
COALESCE(cte.name, r.publisher) AS name
,COALESCE(cte.url, r.publisher_domain) as url
FROM cte
FULL OUTER JOIN related_stories r
ON cte.url = r.publisher_domain
)
SELECT
ROW_NUMBER() OVER() as id
,t.name
,t.url
FROM together t
where t.url is not null
GROUP BY
name
,url
""")
DB.sql("""
alter table stories
add column publisher_id bigint
""")
DB.sql("""
update stories
set publisher_id = publishers.id
from publishers
where publishers.url = stories.publisher_url_domain
""")
DB.sql("""
alter table stories alter publisher_id set data type bigint
""")
DB.sql("""
alter table stories drop publisher;
alter table stories drop publisher_url;
alter table stories drop publisher_url_domain;
alter table stories drop domain;
""")
DB.sql("""
alter table related_stories
add column publisher_id bigint
""")
DB.sql("""
update related_stories
set publisher_id = publishers.id
from publishers
where publishers.url = related_stories.publisher_domain
""")
DB.sql("""
alter table related_stories drop publisher;
alter table related_stories drop publisher_domain;
""")

View File

@ -81,3 +81,4 @@ def distance():
min_index = (np.argmin(distances))
closest = np.unravel_index(min_index, distances.shape)
distances.flatten().shape
DB.close()