"""Command-line entry points that render project plots from the DuckDB store."""

import os
from enum import Enum, auto
from pathlib import Path
from typing import Optional

import click
import duckdb
import matplotlib.pyplot as plt
import polars as pl
import seaborn as sns
import toml
from matplotlib.ticker import ScalarFormatter

# Both locations are read from the environment; a missing variable raises
# KeyError at import time.
DATA_DIR = Path(os.environ['DATA_MINING_DATA_DIR'])
APP_DIR = Path(os.environ['DATA_MINING_APP_DIR'])

# Connection shared by every command in this module.
db = duckdb.connect(str(DATA_DIR / 'project.duckdb'))


@click.group()
def cli():
    """Group of plotting commands."""


class PlotName(str, Enum):
    """Identifiers of the plots that the ``plot`` command can render."""

    TitleLength = "title_len"
    OutletStories = "outlet_stories"


def _plot_title_length():
    """Build a histogram of story title lengths and return its figure."""
    # subplots(1, 1) returns a single Axes object, not an array of axes,
    # so it must not be indexed.
    fig, ax = plt.subplots(1, 1)
    data = db.sql("""
        select length(title) as len
        from stories
    """).df()
    sns.histplot(x=data['len'], bins=50, ax=ax)
    ax.set(ylabel="count", xlabel="title length")
    return fig


def _plot_outlet_stories():
    """Build the ranked per-outlet story-count line plot and return its figure."""
    data = db.sql("""
        with cte as (
            select count(1) as stories
            from stories
            group by outlet
        )
        select row_number() over(order by stories desc) as id
              ,log(stories) as log_count
        from cte
    """).df()
    fig, ax = plt.subplots(1, 1)
    sns.lineplot(x=data['id'], y=data['log_count'], ax=ax)
    # NOTE(review): the query already returns log(stories) and the axis is
    # log-scaled as well, so the value is compressed twice -- confirm that
    # this is intended.
    ax.set(yscale='log', xlabel="outlet", ylabel="log(count of stories)")
    # The tick formatter is configured on the axis (as an instance); it is
    # not a property accepted by Axes.set().
    ax.yaxis.set_major_formatter(ScalarFormatter())
    return fig


@cli.command()
@click.option('-n', '--name', required=True, type=click.Choice(PlotName))
@click.option('-o', '--output', required=False, type=click.Path())
def plot(name: PlotName, output: Optional[str]):
    """Render the plot selected by NAME and save it as an image.

    Args:
        name: Which plot to render.
        output: Destination file. When omitted, the image is written to
            ``docs/<plot value>.png`` under ``APP_DIR``.

    Raises:
        NotImplementedError: If ``name`` is a ``PlotName`` member that has
            no rendering branch here.
    """
    plot_name = PlotName(name)
    # Use the enum *value* explicitly: formatting a str-mixin Enum member in
    # an f-string yields "PlotName.X" instead of the value on Python 3.12+.
    if output:
        output_path = Path(output)
    else:
        output_path = APP_DIR / f'docs/{plot_name.value}.png'

    if plot_name is PlotName.TitleLength:
        fig = _plot_title_length()
    elif plot_name is PlotName.OutletStories:
        fig = _plot_outlet_stories()
    else:
        raise NotImplementedError("option unrecognized")

    # Save through the figure handle *before* showing: once an interactive
    # window has been closed, a later savefig() can write an empty canvas.
    fig.savefig(output_path)
    if plot_name is PlotName.OutletStories:
        plt.show()


if __name__ == "__main__":
    cli()