wwu-577/src/emotion.py

import click
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np

from transformers import BertTokenizer
from model import BertForMultiLabelClassification
from data.main import connect, data_dir
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

def data():
    """Load the stories that have no emotion labels yet.

    Returns:
        A pandas DataFrame with the columns ``id`` and ``title`` holding every
        row of ``stories`` whose ``id`` does not appear as a ``story_id`` in
        ``story_emotions``, ordered by ``id`` descending.
    """
    DB = connect()
    try:
        # .df() materializes the rows into pandas, so the frame stays usable
        # after the connection has been closed.
        return DB.sql("""
            SELECT
                id,
                title
            FROM stories
            WHERE id NOT IN (
                SELECT
                    DISTINCT story_id
                FROM story_emotions
            )
            ORDER BY id DESC
        """).df()
    finally:
        # Release the connection even when the query raises.
        DB.close()

@click.command("emotion:create-table")
def create_table():
    """create the table to hold the title id and labels."""
    table = "story_emotions"
    DB = connect()
    try:
        # The statement must be an f-string: without the prefix the literal
        # text "{table}" is sent to the database instead of the table name.
        DB.execute(f"""
            CREATE OR REPLACE TABLE {table}
            (
                story_id BIGINT,
                label TEXT,
                score REAL
            )
        """)
    finally:
        # Release the connection even when the statement fails.
        DB.close()
    print(f"\"{table}\" created")

@click.command("emotion:extract")
@click.option('-c', '--chunks', type=int, default=5000, show_default=True)
def extract(chunks):
    """extract emotion class labels from titles and put them in the db"""
    # `chunks` is the NUMBER of batches the pending titles are split into
    # (np.array_split semantics), not the number of titles per batch.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
    model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")
    model.to(device)

    table = data()
    if table.empty:
        # Nothing left to label.
        return

    # Never ask for more parts than there are rows: the surplus parts would be
    # empty, and an empty batch cannot be tokenized.
    n_parts = min(chunks, len(table))
    chunked = np.array_split(table.to_numpy(), n_parts)
    for part in tqdm(chunked):
        if len(part) == 0:
            continue
        # Each row is (id, title), the column order returned by data().
        ids = [row[0] for row in part]
        docs = [row[1] for row in part]
        tokens = tokenizer(
            docs,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=92,
            return_attention_mask=True,
            return_tensors="pt",
        )
        tokens = tokens.to(device)
        results = run(model, tokens, ids)
        if not results:
            # No label passed the threshold for this batch; an empty result
            # list would build a frame without columns for the INSERT.
            continue
        # Fix the column order explicitly: the INSERT below is positional
        # (SELECT *) and must match (story_id, label, score).
        # The local must stay named `df`: the SQL refers to it by name.
        df = pd.DataFrame(results, columns=["story_id", "label", "score"])
        # One short-lived connection per batch, so finished batches are
        # persisted even if a later batch fails.
        DB = connect()
        try:
            DB.execute('INSERT INTO story_emotions SELECT * FROM df')
        finally:
            DB.close()

def run(model, tokens, ids, threshold=0.1):
    """Score one tokenized batch and keep the labels above a threshold.

    Args:
        model: callable invoked as ``model(**tokens)``; the first element of
            its return value is used as the logits tensor. It must also expose
            ``config.id2label`` mapping a class index to its label.
        tokens: mapping of keyword arguments passed to the model.
        ids: story ids, one per row of the batch, in batch order.
        threshold: minimum sigmoid score (exclusive) for a label to be kept.
            Defaults to 0.1.

    Returns:
        A list of ``{"story_id", "label", "score"}`` dicts, one per
        (row, class) pair whose score exceeds ``threshold``, ordered by row
        and then by class index.
    """
    with torch.no_grad():
        logits = model(**tokens)[0].to('cpu').detach().numpy()
    scores = 1 / (1 + np.exp(-logits))  # Sigmoid

    id2label = model.config.id2label
    # argwhere walks the matrix in row-major order, i.e. the same order as a
    # nested "for row / for class" loop.
    return [
        {
            "story_id": ids[int(row)],
            "label": id2label[int(col)],
            "score": scores[row, col],
        }
        for row, col in np.argwhere(scores > threshold)
    ]

@click.command("emotion:normalize")
def normalize():
    """normalize the emotion tables."""
    # Steps:
    #   1. (re)build `emotions` with one row per label that has more than
    #      1000 labelled stories;
    #   2. add `story_emotions.emotion_id` and fill it from `emotions`
    #      (rows whose label did not make it into `emotions` keep NULL);
    #   3. drop the now redundant `story_emotions.label` column.
    # The original ended with a SELECT that still read `story_emotions.label`
    # after step 3 had dropped it, and whose result was discarded; it has been
    # removed.
    # NOTE(review): the command is not re-runnable as written - a second run
    # would reference the dropped `label` column in step 1.
    DB = connect()
    try:
        DB.sql("""
            CREATE OR REPLACE TABLE emotions AS
            SELECT
                row_number() over() as id
                ,e.label
                ,COUNT(1) AS stories
            FROM story_emotions e
            JOIN stories s
            ON s.id = e.story_id
            -- WHERE YEAR(s.published_at) < 2022
            GROUP BY e.label
            HAVING stories > 1000
            ORDER BY stories DESC
        """)
        DB.sql("""
            ALTER TABLE story_emotions
            ADD COLUMN emotion_id int64
        """)
        DB.sql("""
            UPDATE story_emotions
            SET emotion_id = emotions.id
            FROM emotions
            WHERE emotions.label = story_emotions.label
        """)
        DB.sql("""
            ALTER TABLE story_emotions
            DROP COLUMN label
        """)
    finally:
        # Release the connection even when one of the statements fails.
        DB.close()

@click.command("emotion:analyze")
def coef_over_time():
    """fit a linear trend to each emotion's share of stories over time"""
    # NOTE(review): this command is registered under the same name
    # ("emotion:analyze") as `analyze`; if both are added to one click group
    # only one of them will be reachable. Left unchanged here because the
    # command name is part of the CLI interface.
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    def load_shares(DB, buckets='1 month'):
        """Return one row per (time bucket, label) with the label's share.

        `date` is the bucket start in (approximate) years since the epoch and
        `stories` is labelled stories / all stories in that bucket. Only
        stories published before 2022 are counted.
        """
        return DB.sql(f"""
            with cte as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                ON s.id = se.story_id
                JOIN emotions e
                ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '{buckets}', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '{buckets}', s.published_at)
            )
            select
                epoch(cte.date) / 60 / 60 / 24 / 365 as date
                ,cte.label
                ,cast(cte.stories as float) / t.stories as stories
            from cte
            join total t
            on t.date = cte.date
        """).df()

    def get_coef(shares, label):
        """Fit share ~ date for one label and return the slope array."""
        reg = linear_model.LinearRegression()
        df = shares[shares['label'] == label]
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories'].to_numpy()
        # Split x and y in ONE call so the rows stay paired. Two separate
        # calls shuffle independently and fit dates against the shares of
        # other dates.
        x_train, _x_test, y_train, _y_test = train_test_split(x, y)
        reg.fit(x_train, y_train)
        return reg.coef_

    DB = connect()
    try:
        emotions = DB.sql("""
            select label from emotions
        """).df()
        shares = load_shares(DB, '2 year')
    finally:
        # Both frames are materialized; the connection is no longer needed.
        DB.close()

    collection = []
    for emotion in emotions['label']:
        if emotion == 'neutral':
            continue
        coef = get_coef(shares, emotion)[0]
        collection.append({'emotion': emotion, 'coef': coef, 'increasing': coef > 0})

    # The sorted table used to be built and thrown away; print it so the
    # command produces a visible result.
    print(pd.DataFrame(collection).sort_values('coef'))

    plt.show()

@click.command("emotion:analyze")
def analyze():
    """plot and group emotional labels"""
    # NOTE(review): this function looks like unfinished scratch code and
    # cannot run to completion as written; the individual problems are marked
    # inline below. It is also registered under the same command name
    # ("emotion:analyze") as `coef_over_time`.
    DB = connect()

    # NOTE(review): `emotions` is loaded but never used in this function.
    emotions = DB.sql("""
        select label from emotions
    """).df()

    from sklearn import linear_model
    from sklearn.model_selection import train_test_split
    # NOTE(review): `get_coef` is defined but never called in this function.
    # Its `emotion` argument is ignored - the query hard-codes 'sadness' - and
    # the `total` CTE is declared but not joined (the join is commented out),
    # so the monthly count is divided by the label's overall story count from
    # `emotions` instead.
    def get_coef(emotion):
        df = DB.sql("""
            with cte as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,e.label
                    ,COUNT(1) AS stories
                FROM stories s
                JOIN story_emotions se
                ON s.id = se.story_id
                JOIN emotions e
                ON e.id = se.emotion_id
                WHERE YEAR(s.published_at) < 2022
                --AND e.label in ('neutral', 'annoyance')
                AND e.label in ('sadness')
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
                    ,e.label
            )
            ,total as (
                SELECT
                    time_bucket(interval '1 month', s.published_at) as date
                    ,COUNT(1) AS stories
                FROM stories s
                WHERE YEAR(s.published_at) < 2022
                GROUP BY
                    time_bucket(interval '1 month', s.published_at)
            )
            select
                epoch(cte.date) as date
                ,cte.label
                --,total.stories as total
                ,cast(cte.stories as float) / e.stories as stories
            from cte
            join emotions e
            --on total.date = cte.date
            on e.label = cte.label
        """).df()

        reg = linear_model.LinearRegression()
        # Single feature column (the bucket date as epoch seconds).
        x = df['date'].to_numpy().reshape(-1, 1)
        y = df['stories']

        # NOTE(review): x and y are split by two independent calls, each with
        # its own random shuffle, so x_train and y_train rows are not paired.
        # A single train_test_split(x, y) call would keep them aligned.
        x_train, x_test = train_test_split(x)
        y_train, y_test = train_test_split(y)
        reg.fit(x_train, y_train)
        #y_pred = reg.predict(x_test)
        return reg.coef_


    # NOTE(review): `yearly` is not defined in this function or anywhere in
    # the visible module, so this line raises NameError. Presumably it was
    # meant to hold a SQL query yielding `date`, `stories` and `label`
    # columns (those are what the plot below reads) - TODO confirm.
    df = DB.sql(f"""{yearly}""").df()
    df['date'] = pd.to_datetime(df['date'])
    ax = sns.lineplot(x=df['date'], y=df['stories'], hue=df['label'])
    #ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
    plt.locator_params(axis='y', nbins=6)
    # Tick labels are rendered as month-year, e.g. "03-21".
    ax.xaxis.set_major_formatter(DateFormatter("%m-%y"))
    plt.show()

    # NOTE(review): the `grouped` CTE below has an empty body, which is not
    # valid SQL, and the result of this call is discarded. The query also
    # reads `story_emotions.label`, which the `normalize` command drops.
    DB.sql("""
        WITH grouped as (
        ), total AS (
            SELECT
                e.label
                ,count(1) as total
            FROM grouped s
            JOIN story_emotions e
            ON e.label = s.label
            GROUP BY
                e.label
        )
        SELECT
            g.year
            ,g.label
            ,100 * (g.stories / CAST(t.total AS float)) AS frac
        FROM grouped g
        JOIN total t
        ON t.label = g.label
        ORDER BY g.label, g.year
    """)
    DB.close()

    # NOTE(review): `df` is still the frame from the first query; nothing
    # visible gives it `year` or `frac` columns (those names only occur in the
    # discarded query above), so these lookups would presumably raise KeyError.
    sns.lineplot(x=df['year'], y=df['frac'], hue=df['label'])
    plt.show()

def debug():
    """Rebuild the emotion tables with an off-the-shelf classifier.

    Every story title is classified with the
    ``j-hartmann/emotion-english-distilroberta-base`` text-classification
    pipeline. The function then

    * replaces ``story_emotions`` with (story_id, label, score) rows,
    * replaces ``emotions`` with one row per label and its story count,
    * swaps ``story_emotions.label`` for an ``emotion_id`` reference, and
    * writes the raw predictions to ``emotions.csv`` ("|"-separated) in the
      data directory.
    """
    from transformers import pipeline

    # load data
    DB = connect()
    try:
        table = DB.sql("""
            SELECT
                id,
                title
            FROM stories
            ORDER BY id DESC
        """).df()
    finally:
        DB.close()

    if table.empty:
        # Nothing to classify; leave the existing tables untouched.
        return

    classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

    chunks = 5000
    # Split row positions rather than the DataFrame itself (passing a frame to
    # np.array_split goes through a deprecated pandas path), and never ask for
    # more parts than rows so the classifier is not handed an empty batch.
    n_parts = min(chunks, len(table))
    labels = []
    ids = []
    for positions in tqdm(np.array_split(np.arange(len(table)), n_parts)):
        chunk = table.iloc[positions]
        with torch.no_grad():
            emotions = classifier(chunk['title'].tolist())
        # The pipeline returns one prediction dict per input title, in order,
        # so the two flat lists stay aligned.
        labels.extend(emotions)
        ids.extend(chunk['id'].tolist())

    # The local must stay named `out`: the SQL below refers to it by name.
    out = pd.DataFrame(labels)
    out_ids = pd.DataFrame(ids, columns=['story_id'])
    out = pd.concat([out_ids, out], axis=1)

    DB = connect()
    try:
        DB.sql("""
            CREATE OR REPLACE TABLE story_emotions AS
            SELECT
                story_id
                ,label
                ,score
            FROM out
        """)
        DB.sql("""
            CREATE OR REPLACE TABLE emotions AS
            SELECT
                row_number() over() as id
                ,label
                ,count(1) as stories
            FROM story_emotions
            GROUP BY
                label
        """)
        DB.sql("""
            ALTER TABLE story_emotions add emotion_id bigint
        """)
        DB.sql("""
            UPDATE story_emotions
            SET emotion_id = emotions.id
            FROM emotions
            WHERE story_emotions.label = emotions.label
        """)
        DB.sql("""
            ALTER TABLE story_emotions drop column label
        """)
        # The two trailing "select *" inspection queries were removed: their
        # results were discarded, so they had no effect.
    finally:
        DB.close()

    out.to_csv(data_dir() / 'emotions.csv', sep="|")

def another():
    # Exploratory plots of emotion counts per year and per publisher bias.
    # NOTE(review): this looks like notebook scratch code pasted into a
    # function. It cannot run to completion as written; the individual
    # problems are marked inline below.
    DB = connect()

    # NOTE(review): the results of these two queries are discarded.
    DB.sql("""
        select
            *
        from emotions
    """)

    DB.sql("""
        select
            *
        from story_emotions
    """)

    # Story count per (publication year, emotion label).
    emotions = DB.sql("""
        SELECT
            YEAR(s.published_at) AS year
            ,e.label AS emotion
            ,count(1) AS stories
        FROM stories s
        JOIN story_emotions se
        ON s.id = se.story_id
        JOIN emotions e
        ON e.id = se.emotion_id
        GROUP by
            YEAR(s.published_at)
            ,e.label
    """).df()
    # Bare expression: has no effect inside a function.
    emotions

    sns.scatterplot(x=emotions['year'], y=emotions['stories'], hue=emotions['emotion'])
    plt.show()

    # Wide layout: one row per year, one column per emotion label.
    pivot = emotions.pivot(index='year', columns='emotion', values='stories')
    pivot.reset_index(inplace=True)
    from sklearn.linear_model import LinearRegression
    reg = LinearRegression()

    # Skip the first column (`year` after reset_index) and print the slope of
    # stories-per-year for each emotion column.
    for emotion in pivot.keys()[1:].tolist():
        _ = reg.fit(pivot['year'].to_numpy().reshape(-1, 1), pivot[emotion])
        print(f"{emotion}: {reg.coef_[0]}")

    fig, ax = plt.subplots()
    #sns.lineplot(x=pivot['anger'], y=pivot['joy'])
    #sns.lineplot(x=pivot['anger'], y=pivot['surprise'], ax=ax)
    # NOTE(review): assumes 'anger' and 'fear' are among the emotion labels -
    # TODO confirm against the `emotions` table.
    sns.lineplot(x=pivot['anger'], y=pivot['fear'], ax=ax)
    # NOTE(review): pivot[''] looks up a column named with the empty string;
    # unless such a label exists this presumably raises KeyError.
    sns.lineplot(x=pivot[''], y=pivot['fear'], ax=ax)
    plt.show()

    DB.close()

    # NOTE(review): every DB.sql call from here on uses the connection after
    # DB.close() above. The queries also read `story_emotions.label`, which
    # the `normalize` command drops - TODO confirm which schema is expected.
    normalized = DB.sql("""
        with cte as (
            select
                year(s.published_at) as year
                ,se.label as emotion
                ,b.label as bias
            from stories s
            join story_emotions se
            on s.id = se.story_id
            join publisher_bias b
            on b.id = s.publisher_id
            where b.label != 'allsides'
            and se.label != 'neutral'
        )
        select
            distinct
            year
            ,emotion
            ,bias
            ,cast(count(1) over(partition by year, bias, emotion) as float) / count(1) over(partition by year, bias) as group_count
        from cte
    """).df()

    # NOTE(review): the frame built by this query is discarded.
    DB.sql("""
        select
            b.label as bias
            ,count(1) as stories
        from stories s
        join story_emotions se
        on s.id = se.story_id
        join publisher_bias b
        on b.id = s.publisher_id
        group by
            b.label
    """).df()

    # NOTE(review): `emotional_bias` is not defined in this function or
    # anywhere in the visible module, so this line raises NameError.
    another_pivot = emotional_bias.pivot(index=['bias', 'year'], columns='emotion', values='stories')
    another_pivot.reset_index(inplace=True)

    sns.lineplot(data=normalized, x='year', y='group_count', hue='bias', style='emotion')
    plt.show()

    # One line-plot panel per bias value, one line per emotion.
    sns.relplot(
        data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line"
        #data=normalized, x="year", y="group_count", hue="emotion", col='bias', kind="line", facet_kws=dict(sharey=False)
    )
    plt.show()

    # NOTE(review): the result of this query is discarded.
    DB.sql("""
    select
    *
    from another_pivot
    """)