Add loading of CSV data into the database.

parent feb3a4b8ed
commit 297aeec32d
@@ -4,7 +4,7 @@ import requests
 from pathlib import Path
 import click
 from tqdm import tqdm
-from data import data_dir
+from data import data_dir, connect
 from lxml import etree
 import pandas as pd
 
@@ -12,6 +12,29 @@ import pandas as pd
 def cli():
     ...
 
+@cli.command()
+@click.option('--directory', type=Path, default=data_dir())
+@click.option('--database', type=Path, default=data_dir() / "stories.duckdb")
+def load(directory, database):
+    stories = directory / "stories.csv"
+    related = directory / "related.csv"
+    db = connect()
+
+    db.sql(f"""
+        CREATE TABLE stories AS
+        SELECT
+            *
+        FROM read_csv_auto('{stories}')
+    """)
+
+    db.sql(f"""
+        CREATE TABLE related_stories AS
+        SELECT
+            *
+        FROM read_csv_auto('{related}')
+    """)
+    db.close()
+
 @cli.command()
 @click.option('-o', 'output_dir', type=Path, default=data_dir() / "memeorandum")
 def download(output_dir):
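The new load command materializes the two scraped CSVs as DuckDB tables via read_csv_auto. One thing to note: load accepts a --database option but then calls connect() with no arguments, so the target path is presumably fixed inside data.py. A minimal sketch of what connect might look like there (the duckdb import and the default path are assumptions, not shown in this commit):

    from pathlib import Path
    import duckdb

    def connect(database: Path = Path("data") / "stories.duckdb"):
        # hypothetical default; the real location comes from data_dir()
        # CREATE TABLE ... AS SELECT ... FROM read_csv_auto(...) is stock
        # DuckDB SQL, so a plain connection is all the load command needs
        return duckdb.connect(str(database))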
@@ -20,6 +43,7 @@ def download(output_dir):
     end = date.today()
     dates = []
     while cur <= end:
-        dates.append(cur)
+        if not (output_dir / f"{cur.strftime('%y-%m-%d')}.html").exists():
+            dates.append(cur)
         cur = cur + day
     date_iter = tqdm(dates, postfix="test")
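The download hunk now skips dates whose page is already on disk, so an interrupted crawl resumes instead of re-fetching everything. The setup for cur and day sits outside the hunk; a self-contained sketch of the same resume logic, with that setup filled in as assumptions:

    from datetime import date, timedelta
    from pathlib import Path

    def pending_dates(output_dir: Path, start: date) -> list[date]:
        day = timedelta(days=1)  # assumed step; the hunk only shows `cur + day`
        cur, end, dates = start, date.today(), []
        while cur <= end:
            # %y is a two-digit year, matching the filenames download writes
            if not (output_dir / f"{cur.strftime('%y-%m-%d')}.html").exists():
                dates.append(cur)
            cur = cur + day
        return dates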
@@ -51,6 +75,9 @@ def parse(directory, output_dir):
     # tree = etree.parse(str(page), parser)
     tree = etree.parse(str(page), parser)
     root = tree.getroot()
+    if not root:
+        print(f"error opening {page}")
+        continue
     items = root.xpath("//div[contains(@class, 'item')]")
 
     for item in items:
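A caveat on the new guard: lxml elements define truthiness by child count, so `if not root:` also fires on a parsed page whose root simply has no children, and a recovering parse of a bad file can hand back None rather than a falsy element. An explicit None check is the clearer idiom; a sketch assuming a parser setup like the one outside this hunk:

    from lxml import etree

    def page_items(page):
        parser = etree.HTMLParser()  # assumed; the parser's construction is not in this hunk
        tree = etree.parse(str(page), parser)
        root = tree.getroot()
        if root is None:  # nothing recoverable was parsed
            print(f"error opening {page}")
            return []
        return root.xpath("//div[contains(@class, 'item')]")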
@@ -64,8 +91,11 @@ def parse(directory, output_dir):
     else:
         author = ''
     out['author'] = author
-    url = citation[0].getchildren()[0].get('href')
-    publisher = citation[0].getchildren()[0].text
+    try:
+        url = citation[0].getchildren()[0].get('href')
+        publisher = citation[0].getchildren()[0].text
+    except IndexError as e:
+        print(f"error with citation url: {page}")
     out['publisher'] = publisher
     out['publisher_url'] = url
     title = item.xpath('.//strong/a')[0].text
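As committed, the except branch logs the failure but appears to leave url and publisher unbound on the first bad item (a NameError at out['publisher'] = publisher) or stale from the previous iteration after that. One defensive variant, sketched as a hypothetical helper:

    def citation_info(citation, page):
        url, publisher = '', ''  # defaults so a missing citation cannot leak state
        try:
            link = citation[0].getchildren()[0]
            url = link.get('href')
            publisher = link.text
        except IndexError:
            print(f"error with citation url: {page}")
        return url, publisher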
src/word.py: 15 changed lines
@@ -14,20 +14,21 @@ def train():
     table = from_db(Data.Titles)
     n_classes = 10
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
 
-    # create tokens, padding to max width
-    tokens = tokenizer(table['title'].apply(str).to_list(), add_special_tokens = True, truncation = True, padding = "max_length", return_attention_mask = True, return_tensors = "pt")
-    pred_y = outputs[:, 0, :]
 
     model = RobertaModel.from_pretrained("roberta-base")
-    pred_y = model(**inputs)
-    outputs = model(**tokens)
+    def get_embeddings(titles):
+        # create tokens, padding to max width
+        tokens = tokenizer(titles, add_special_tokens = True, truncation = True, padding = "max_length", return_attention_mask = True, return_tensors = "pt")
+        outputs = model(**tokens)
+        return outputs.last_hidden_state[:, 0, :]
+    titles = table['title'].apply(str).to_list()[:10]
+    get_embeddings(titles)
 
     # linear = torch.nn.Linear(model.config.hidden_size, n_classes)
     # act = torch.nn.Sigmoid()
 
     # model = Model()
-    pred_y.last_hidden_state[:, 0, :].shape
     classes = act(linear(pred_y.last_hidden_state[:, 0, :])).detach()
 
 @cli.command()
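In src/word.py the embedding code moves into a get_embeddings helper that returns the hidden state of the first token (<s>, RoBERTa's CLS equivalent) as a per-title embedding. As committed, the get_embeddings(titles) result is discarded and the later classes = ... line still references the removed pred_y. A standalone sketch of the same embedding step, with torch.no_grad() added on the assumption that this is inference only:

    import torch
    from transformers import AutoTokenizer, RobertaModel

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = RobertaModel.from_pretrained("roberta-base")

    def get_embeddings(titles):
        # tokenize, padding every title out to the model's max width
        tokens = tokenizer(titles, add_special_tokens=True, truncation=True,
                           padding="max_length", return_attention_mask=True,
                           return_tensors="pt")
        with torch.no_grad():  # inference only; skip gradient tracking
            outputs = model(**tokens)
        return outputs.last_hidden_state[:, 0, :]  # shape: (batch, hidden_size)

    print(get_embeddings(["example title"]).shape)  # torch.Size([1, 768])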