66 lines
1.8 KiB
Python
66 lines
1.8 KiB
Python
import polars as pl
|
|
import duckdb
|
|
import toml
|
|
import os
|
|
from pathlib import Path
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
from enum import Enum, auto
|
|
import click
|
|
|
|
# Both roots come from the environment; indexing os.environ directly means a
# missing variable fails immediately with KeyError when the module is loaded.
DATA_DIR = Path(os.environ["DATA_MINING_DATA_DIR"])
APP_DIR = Path(os.environ["DATA_MINING_APP_DIR"])

# Module-level DuckDB connection to the database file stored under DATA_DIR.
db = duckdb.connect(os.fspath(DATA_DIR.joinpath("project.duckdb")))
|
|
|
|
# Root click command group. It does no work itself; commands register on it
# through ``@cli.command()``. A ``#`` comment is used instead of a docstring so
# the group's ``--help`` text stays exactly as it was.
@click.group()
def cli():
    pass
|
|
|
|
|
|
class PlotName(str, Enum):
    """Names of the plots selectable through the ``plot`` command.

    Members are also ``str`` instances, so each one compares equal to its
    string value.
    """

    # Histogram of story title lengths.
    TitleLength = "title_len"

    # Per-outlet story counts.
    OutletStories = "outlet_stories"
|
|
|
|
@cli.command()
@click.option('-n', '--name', required=True, type=click.Choice(PlotName))
@click.option('-o', '--output', required=False, type=click.Path())
def plot(name: PlotName, output: "Path | str | None" = None):
    """Draw the plot selected by NAME and save it as an image file.
    \f

    Args:
        name: Which plot to draw; a ``PlotName`` member or its string value.
        output: Destination image path. When empty or ``None`` the figure is
            written to ``APP_DIR / 'docs/<plot value>.png'``.

    Raises:
        NotImplementedError: If ``name`` does not correspond to a known plot.
    """
    # Kept function-local, as in the original, so module import stays light.
    from matplotlib.ticker import ScalarFormatter

    # Accept either the enum member or its raw string value, but keep the
    # original exception type for anything unrecognised.
    try:
        plot_name = PlotName(name)
    except ValueError:
        raise NotImplementedError("option unrecognized") from None

    # Build the default file name from ``.value``: formatting a str-mixin Enum
    # member directly in an f-string gives "PlotName.X" instead of the value
    # on newer Python versions, which would silently change the file name.
    output = output if output else APP_DIR / f'docs/{plot_name.value}.png'

    show_after_save = False

    if plot_name is PlotName.TitleLength:
        # subplots(1, 1) returns a single Axes (not an array), so it must be
        # used directly rather than indexed.
        fig, ax = plt.subplots(1, 1)
        data = db.sql("""
            select
                length(title) as len
            from stories
        """).df()
        sns.histplot(x=data['len'], bins=50, ax=ax)
        ax.set(ylabel="count", xlabel="title length")

    elif plot_name is PlotName.OutletStories:
        # One row per outlet, ranked by story count (id 1 = most stories).
        data = db.sql("""
            with cte as (
                select
                    count(1) as stories
                from stories
                group by outlet
            )
            select
                row_number() over(order by stories desc) as id
                ,log(stories) as log_count
            from cte
        """).df()

        fig, ax = plt.subplots(1, 1)
        sns.lineplot(x=data['id'], y=data['log_count'], ax=ax)
        # NOTE(review): the values are already log-transformed in SQL and the
        # axis is log-scaled as well; kept as in the original - confirm that
        # the double log is intended.
        ax.set(yscale='log', xlabel="outlet", ylabel="log(count of stories)")
        # Changing the scale resets the axis formatter, so the plain-number
        # formatter has to be installed afterwards, and as an instance.
        ax.yaxis.set_major_formatter(ScalarFormatter())
        show_after_save = True

    else:
        raise NotImplementedError("option unrecognized")

    # Save through the figure object, and before any interactive display:
    # once a shown window is closed the figure may no longer hold the plot.
    fig.savefig(output)

    if show_after_save:
        # The original displayed this plot interactively; that is preserved,
        # but only after the file has been written.
        plt.show()
|
|
|
|
# Script entry point: dispatch to the click group only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    cli()
|