wwu-577/src/cli.py

66 lines
1.8 KiB
Python

import polars as pl
import duckdb
import toml
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from enum import Enum, auto
import click
# Directory holding the project's data files. Read from the
# DATA_MINING_DATA_DIR environment variable; a missing variable raises
# KeyError when this module is imported.
DATA_DIR = Path(os.environ["DATA_MINING_DATA_DIR"])

# Base directory for generated artefacts (the plot command writes its default
# output under APP_DIR / 'docs'). Read from DATA_MINING_APP_DIR.
APP_DIR = Path(os.environ["DATA_MINING_APP_DIR"])

# Module-wide DuckDB connection to 'project.duckdb' inside DATA_DIR, opened
# at import time and shared by the commands below.
db = duckdb.connect(os.fspath(DATA_DIR.joinpath("project.duckdb")))
@click.group()
def cli():
    # Root click command group; subcommands register via @cli.command().
    # The body is intentionally a no-op. Documented with comments rather than
    # a docstring so the text click prints for --help stays unchanged.
    pass
class PlotName(str, Enum):
    """Identifiers of the plots the ``plot`` command can render.

    Members are also ``str`` instances, so each compares equal to its plain
    string value.
    """

    # Histogram of story title lengths.
    TitleLength = 'title_len'
    # Per-outlet story counts, ranked from most to fewest stories.
    OutletStories = 'outlet_stories'
@cli.command()
@click.option('-n', '--name', required=True, type=click.Choice(PlotName))
@click.option('-o', '--output', required=False, type=click.Path())
def plot(name: PlotName, output: "Path | str | None"):
    # Render the plot selected by ``name`` and save it as an image file.
    #
    # name:   which plot to draw; a PlotName member or its string value.
    # output: destination image path. When omitted (None or empty) the image
    #         is written to APP_DIR / 'docs/<plot value>.png'.
    # Raises: NotImplementedError when ``name`` does not match a known plot.
    #
    # Documented with comments rather than a docstring so the text click
    # prints for `plot --help` stays unchanged.

    # Depending on the click version the option may arrive as a plain string
    # or as an enum member; normalise so the comparisons and the default file
    # name below behave the same either way.
    try:
        plot_name = PlotName(name)
    except ValueError:
        raise NotImplementedError("option unrecognized") from None

    # Use ``.value`` explicitly: formatting a (str, Enum) member inside an
    # f-string yields 'PlotName.TitleLength' on newer Python versions, which
    # would silently change the default file name.
    target = output if output else APP_DIR / f'docs/{plot_name.value}.png'

    show_after_save = False
    if plot_name is PlotName.TitleLength:
        data = db.sql("""
            select
                length(title) as len
            from stories
        """).df()
        # subplots(1, 1) returns a single Axes object, not an array, so it
        # must not be indexed.
        fig, ax = plt.subplots(1, 1)
        sns.histplot(x=data['len'], bins=50, ax=ax)
        ax.set(ylabel="count", xlabel="title length")
    elif plot_name is PlotName.OutletStories:
        data = db.sql("""
            with cte as (
                select
                    count(1) as stories
                from stories
                group by outlet
            )
            select
                row_number() over(order by stories desc) as id
                ,log(stories) as log_count
            from cte
        """).df()
        fig, ax = plt.subplots(1, 1)
        sns.lineplot(x=data['id'], y=data['log_count'], ax=ax)
        from matplotlib.ticker import ScalarFormatter
        # NOTE(review): log_count is already a logarithm and is drawn on a
        # log-scaled axis, so outlets with a single story (log_count == 0)
        # cannot be drawn. Kept as in the original; confirm whether a linear
        # axis was intended.
        ax.set(yscale='log', xlabel="outlet", ylabel="log(count of stories)")
        # The formatter is installed on the axis after the scale is set,
        # because changing the scale resets the axis formatters. It must be
        # an instance, not the class.
        ax.yaxis.set_major_formatter(ScalarFormatter())
        show_after_save = True
    else:
        # Reached only if PlotName gains a member without a branch here.
        raise NotImplementedError("option unrecognized")

    # Save before showing: once an interactive window has been shown and
    # closed, saving the "current" figure can produce an empty image.
    fig.savefig(target)
    if show_after_save:
        plt.show()
# Run the click command group only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    cli()