commit b9c63414a0a77e31d0936df328ebd24a845086e2
Author: matt
Date:   Tue Apr 11 13:27:56 2023 -0700

    init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c682433
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.csv
+*.swp
diff --git a/docs/title_len.png b/docs/title_len.png
new file mode 100644
index 0000000..6438090
Binary files /dev/null and b/docs/title_len.png differ
diff --git a/src/cli.py b/src/cli.py
new file mode 100644
index 0000000..3e1464d
--- /dev/null
+++ b/src/cli.py
@@ -0,0 +1,65 @@
+import polars as pl
+import duckdb
+import toml
+import os
+from pathlib import Path
+import seaborn as sns
+import matplotlib.pyplot as plt
+from enum import Enum, auto
+import click
+
+DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
+APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
+
+db = duckdb.connect(str(DATA_DIR / 'project.duckdb'))
+
+@click.group()
+def cli():
+    ...
+
+
+class PlotName(str, Enum):
+    TitleLength = "title_len"
+    OutletStories = "outlet_stories"
+
+@cli.command()
+@click.option('-n', '--name', required=True, type=click.Choice([p.value for p in PlotName]))
+@click.option('-o', '--output', required=False, type=click.Path())
+def plot(name: str, output: Path):
+    output = output if output else APP_DIR / f'docs/{name}.png'
+    if name == PlotName.TitleLength:
+        fig, ax = plt.subplots(1, 1)
+        data = db.sql("""
+            select
+                length(title) as len
+            from stories
+            """).df()
+        sns.histplot(x=data['len'], bins=50, ax=ax)
+        ax.set(ylabel="count", xlabel="title length")
+    elif name == PlotName.OutletStories:
+
+        data = db.sql("""
+            with cte as (
+                select
+                    count(1) as stories
+                from stories
+                group by outlet
+            )
+            select
+                row_number() over(order by stories desc) as id
+                ,log(stories) as log_count
+            from cte
+            """).df()
+
+        fig, ax = plt.subplots(1, 1)
+        sns.lineplot(x=data['id'], y=data['log_count'], ax=ax)
+        from matplotlib.ticker import ScalarFormatter
+        ax.set(yscale='log', xlabel="outlet", ylabel="log(count of stories)")
+        ax.yaxis.set_major_formatter(ScalarFormatter())
+
+    else:
+        raise NotImplementedError("option unrecognized")
+    plt.savefig(output)
+
+if __name__ == "__main__":
+    cli()
diff --git a/src/data.py b/src/data.py
new file mode 100644
index 0000000..c55324c
--- /dev/null
+++ b/src/data.py
@@ -0,0 +1,21 @@
+import os
+from pathlib import Path
+import duckdb
+from enum import Enum
+
+class Data(str, Enum):
+    Titles = 'titles'
+
+def from_db(t: Data):
+    DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
+    # APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])
+    DB = duckdb.connect(str(DATA_DIR / 'project.duckdb'))
+    if t == Data.Titles:
+        table = DB.sql("""
+            select
+                distinct
+                title
+            from stories
+            limit 100
+            """).df()
+        return table
diff --git a/src/lib.py b/src/lib.py
new file mode 100644
index 0000000..deef1b2
--- /dev/null
+++ b/src/lib.py
@@ -0,0 +1,8 @@
+import sklearn
+import polars as pl
+import toml
+from pathlib import Path
+
+config = toml.load('/home/user/577/repo/config.toml')
+app_dir = Path(config.get('app').get('path'))
+df = pl.read_csv(app_dir / "data/articles.csv")
diff --git a/src/model.py b/src/model.py
new file mode 100644
index 0000000..ec7558c
--- /dev/null
+++ b/src/model.py
@@ -0,0 +1,16 @@
+from transformers import AutoTokenizer, RobertaModel
+import torch
+from torch import nn
+
+class Model(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.n_classes = 10
+        self.bert = RobertaModel.from_pretrained("roberta-base")
+        self.linear = torch.nn.Linear(self.bert.config.hidden_size, self.n_classes)
+        self.act = torch.nn.Sigmoid()
+
+    def forward(self, x):
+        outs = self.bert(**x)
+        outs = self.act(self.linear(outs.last_hidden_state))
+        return outs
diff --git a/src/nearest_neighbor.py b/src/nearest_neighbor.py
new file mode 100644
index 0000000..026628e
--- /dev/null
+++ b/src/nearest_neighbor.py
@@ -0,0 +1,5 @@
+import pandas as pd
+import math
+
+df = pd.read_csv('/tmp/attr.csv')
+((((df.left - 9.1) ** 2) + ((df.right - 11.0) ** 2)) ** 0.5).sort_values()
diff --git a/src/word.py b/src/word.py
new file mode 100644
index 0000000..88a35ef
--- /dev/null
+++ b/src/word.py
@@ -0,0 +1,49 @@
+import click
+import torch
+import numpy as np
+from scipy.spatial.distance import cdist
+from transformers import AutoTokenizer, RobertaModel
+from model import Model
+from data import Data, from_db
+
+# placeholder location so the train and distance commands can share the embeddings
+CLASSES_PATH = '/tmp/classes.npy'
+
+@click.group()
+def cli():
+    ...
+
+@cli.command()
+def train():
+    table = from_db(Data.Titles)
+    n_classes = 10
+    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+
+    # create tokens, padding to max width
+    tokens = tokenizer(table['title'].apply(str).to_list(), add_special_tokens=True, truncation=True, padding="max_length", return_attention_mask=True, return_tensors="pt")
+
+    # encode the titles and keep the [CLS] token representation per title
+    model = RobertaModel.from_pretrained("roberta-base")
+    with torch.no_grad():
+        outputs = model(**tokens)
+    pred_y = outputs.last_hidden_state[:, 0, :]
+
+    # project the [CLS] embeddings onto n_classes sigmoid outputs
+    linear = torch.nn.Linear(model.config.hidden_size, n_classes)
+    act = torch.nn.Sigmoid()
+    classes = act(linear(pred_y)).detach()
+    np.save(CLASSES_PATH, classes.numpy())
+
+@cli.command()
+def distance():
+    # load the class vectors written by `train`
+    classes = np.load(CLASSES_PATH)
+    distances = cdist(classes, classes, 'euclidean')
+    np.fill_diagonal(distances, np.inf)
+    min_index = np.argmin(distances)
+    closest = np.unravel_index(min_index, distances.shape)
+    click.echo(f"closest pair of titles: {closest}")
+
+
+if __name__ == "__main__":
+    cli()
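
Note: the commit defines Model in src/model.py but never exercises it. Below is a minimal sketch (not part of the commit) of how it could be driven, assuming the roberta-base weights are downloadable and using a couple of made-up titles as input; real titles would come from the stories table.

import torch
from transformers import AutoTokenizer

from model import Model

# made-up sample input for illustration only
titles = ["example headline one", "example headline two"]

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokens = tokenizer(titles, truncation=True, padding="max_length", return_tensors="pt")

model = Model()
with torch.no_grad():
    outs = model(tokens)  # sigmoid activations, shape (batch, seq_len, 10)
print(outs.shape)

torch.no_grad() is used because this commit has no training loop yet; the linear head is randomly initialized.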
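
The distance command depends on filling the diagonal with inf before taking an argmin over the flattened matrix and unraveling the index. A tiny self-contained check of that pattern on hand-made vectors:

import numpy as np
from scipy.spatial.distance import cdist

# three hand-made "class" vectors; rows 1 and 2 are deliberately close together
vecs = np.array([[0.0, 0.0], [1.0, 1.0], [1.1, 1.0]])

d = cdist(vecs, vecs, 'euclidean')
np.fill_diagonal(d, np.inf)                      # exclude self-matches
closest = np.unravel_index(np.argmin(d), d.shape)
print(closest)                                   # rows 1 and 2, the nearest pair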