Probability of a class given the attributes is proportional to the overall probability of that class multiplied by the product of the probabilities of each individual attribute value given the class:
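In symbols (a standard statement of this rule, with $C_{i}$ a class and $a_{1}, \dots, a_{n}$ the observed attribute values):
\begin{displaymath}
P(C_{i} \mid a_{1}, \dots, a_{n}) \propto P(C_{i}) \prod_{j=1}^{n} P(a_{j} \mid C_{i})
\end{displaymath}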
Understand the concept of overfitting and be able to tell how you would know that a classification system has overfit.
\paragraph{Definition}
A classifier overfits if it generates a decision tree (or other mechanism) too well adapted to the training set:
it performs well on the training set, but not well on other data.
Some overfitting is inevitable.
Remedies:
\begin{itemize}
\item Adjust the decision tree while it is being generated: pre-pruning
\item Modify the tree after creation: post-pruning
\end{itemize}
\subsubsection{Clashes}
Two (or more) instances in the training set have identical attribute values but different classifications.
Especially a problem for TDIDT's 'Adequacy condition'.
\paragraph{Stems from}
\begin{itemize}
\item Classification incorrectly recorded
\item Recorded attributes insufficient: more attributes would be needed, which is normally impossible
\end{itemize}
\paragraph{Solutions}
\begin{itemize}
\item Discard the branch to the clashing node from the node above
\item Assign all clashing instances the majority (most frequent) label (sketched below)
\end{itemize}
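A minimal sketch of the majority-label resolution, assuming the clashing instances are given simply as a list of class labels (the function name and example labels are illustrative, not from the notes):
\begin{verbatim}
from collections import Counter

def resolve_clash(labels):
    """Return the most frequent class label among clashing instances."""
    # most_common(1) gives a list containing the (label, count) pair
    # for the majority class.
    return Counter(labels).most_common(1)[0][0]

# Example: two clashing instances labelled 'yes', one labelled 'no'.
print(resolve_clash(["yes", "no", "yes"]))  # -> yes
\end{verbatim}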
\subsubsection{Prepruning}
Pre-pruning may reduce accuracy on the training set, but the pruned classifier may perform better on test (and subsequent) data than an unpruned one.
\begin{enumerate}
\item Test whether a termination condition applies.
\begin{itemize}
\item If so, the current subset is treated as a 'clash set'
\item Resolve by 'delete branch,' 'majority voting,' etc.
\end{itemize}
\item Two methods (sketched after this list):
\begin{itemize}
\item Size cutoff: prune if the subset has fewer than X instances
\item Maximum depth: prune if the length of the branch exceeds Y
\end{itemize}
\end{enumerate}
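A minimal sketch of these termination tests, assuming a recursive TDIDT-style tree builder; the thresholds and function names below are illustrative assumptions, not part of the notes:
\begin{verbatim}
MAX_DEPTH = 5        # illustrative maximum branch length (Y)
MIN_INSTANCES = 10   # illustrative size cutoff (X)

def should_preprune(subset, depth):
    """Pre-pruning termination test: stop splitting when the subset
    is too small (size cutoff) or the branch is too long (max depth)."""
    return len(subset) < MIN_INSTANCES or depth >= MAX_DEPTH

# Inside the tree builder, a node whose subset satisfies this test is
# treated as a clash set and resolved (e.g. by majority voting)
# instead of being split further.
\end{verbatim}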
\subsubsection{Post-pruning}
\begin{enumerate}
\item Look for non-leaf nodes whose descendant branches all have length 1, i.e. whose children are all leaf nodes (see the sketch after this list).
\item In the example tree, only nodes G and D are candidates for pruning (consolidation).
\end{enumerate}
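A minimal sketch of finding such candidates, assuming the tree is represented as nested dictionaries whose leaves are plain class labels (this representation, and the example tree, are assumptions for illustration):
\begin{verbatim}
def is_leaf(node):
    """Leaves are plain class labels; internal nodes are dictionaries
    mapping branch labels to subtrees."""
    return not isinstance(node, dict)

def pruning_candidates(node, name="root"):
    """Yield names of non-leaf nodes whose children are all leaves."""
    if is_leaf(node):
        return
    if all(is_leaf(child) for child in node.values()):
        yield name                      # candidate for consolidation
    for branch, child in node.items():
        yield from pruning_candidates(child, branch)

# Example: node 'D' has only leaf children, so it is the only candidate.
tree = {"A": {"D": {"yes": "c1", "no": "c2"}, "E": "c3"}}
print(list(pruning_candidates(tree)))   # -> ['D']
\end{verbatim}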
\subsection{Discretizing}
\subsubsection{Equal Width Intervals}
\subsubsection{Pseudo Attributes}
\subsubsection{Processing Sorted Instance Table}
\subsubsection{ChiMerge}
\paragraph{Rationale}
Initially, each distinct value of a numerical attribute $A$ is considered to be one interval.
$\chi^{2}$ tests are performed for every pair of adjacent intervals.
Adjacent intervals with the smallest $\chi^{2}$ value are merged, because a low $\chi^{2}$ value for a pair indicates similar class distributions.
This merging process proceeds recursively until a predefined stopping criterion is met.
For two adjacent intervals, if the $\chi^{2}$ test concludes that the class is independent of the interval, the intervals should be merged.
If the $\chi^{2}$ test concludes that they are not independent, i.e. the difference in relative class frequencies is statistically significant, the two intervals should remain separate.
\paragraph{Calculation}
To calculate the expected value for any combination of row (interval) and class (see the formula after this list):
\begin{enumerate}
\item Take the product of the corresponding row sum and column sum
\item Divide by the grand total of the observed values for the two rows.
\end{enumerate}
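In symbols, with $R_{i}$ the sum of row $i$, $C_{j}$ the sum of class column $j$, and $N$ the grand total of observed values for the two rows (notation chosen here for illustration):
\begin{displaymath}
E_{ij} = \frac{R_{i} \times C_{j}}{N}
\end{displaymath}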
Then:
\begin{enumerate}
\item Using the observed and expected values, calculate, for each of the cells:
\begin{math}
\frac{(O - E)^{2}}{E}
\end{math}
\item Sum these values over all cells to obtain $\chi^{2}$ (see the formula after this list)
\end{enumerate}
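That is, using $O_{ij}$ and $E_{ij}$ for the observed and expected values of the cell in row $i$ and class column $j$:
\begin{displaymath}
\chi^{2} = \sum_{i}\sum_{j} \frac{(O_{ij} - E_{ij})^{2}}{E_{ij}}
\end{displaymath}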
When $\chi^{2}$ exceeds the threshold, the hypothesis of independence is rejected.
A small value supports the hypothesis.
Important adjustment: when $E < 0.5$, replace it with $0.5$.
\begin{enumerate}
\item Select the smallest $\chi^{2}$ value
\item Compare it to the threshold
\item If it falls below the threshold, merge the corresponding interval with the row immediately below it
\item Recalculate $\chi^{2}$; this only needs to be done for rows adjacent to the recently merged one (the loop is sketched after this list)
\end{enumerate}
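A minimal sketch of the ChiMerge merging loop, assuming each interval (row) is represented as a dictionary of observed class frequencies and that a $\chi^{2}$ threshold for the chosen significance level is supplied; all names are illustrative, not from the notes:
\begin{verbatim}
def chi2(row_a, row_b, classes):
    """Chi-squared statistic for a pair of adjacent intervals (rows)."""
    total = sum(row_a.values()) + sum(row_b.values())
    value = 0.0
    for row in (row_a, row_b):
        row_sum = sum(row.values())
        for c in classes:
            col_sum = row_a.get(c, 0) + row_b.get(c, 0)
            expected = max(row_sum * col_sum / total, 0.5)  # E < 0.5 -> 0.5
            observed = row.get(c, 0)
            value += (observed - expected) ** 2 / expected
    return value

def chimerge(intervals, classes, threshold):
    """Merge adjacent intervals while the smallest chi-squared value
    falls below the threshold (class distributions look similar)."""
    intervals = [dict(row) for row in intervals]   # work on a copy
    while len(intervals) > 1:
        # For clarity all pairs are recomputed; only pairs adjacent to
        # the most recently merged row actually change between passes.
        scores = [chi2(intervals[i], intervals[i + 1], classes)
                  for i in range(len(intervals) - 1)]
        best = scores.index(min(scores))
        if scores[best] >= threshold:   # every pair significantly different
            break
        merged = {c: intervals[best].get(c, 0) + intervals[best + 1].get(c, 0)
                  for c in classes}
        intervals[best:best + 2] = [merged]
    return intervals
\end{verbatim}
For example, two adjacent rows with class counts \{yes: 4, no: 1\} and \{yes: 5, no: 1\} give a very small $\chi^{2}$ value and would typically be merged, while rows with very different class distributions remain separate.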
A large number of intervals does little to solve the problem of discretization.
Just one interval cannot contribute to the decision-making process.
Remedies: adjust the significance level that the hypothesis of independence must pass before an interval merge is triggered; set a minimum and a maximum number of intervals.
\subsection{Entropy}
\paragraph{Definition}
Entropy is a measure of the uncertainty arising from there being more than one possible classification.
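For a training set with $K$ classes, in which a proportion $p_{i}$ of the instances belong to class $i$, the standard definition is
\begin{displaymath}
E = -\sum_{i=1}^{K} p_{i} \log_{2} p_{i}
\end{displaymath}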
Used for selecting splitting attributes in decision trees.
Minimizing entropy reduces the complexity (number of branches) of the decision tree.
There is no guarantee that using entropy will always lead to a small decision tree.
Used for feature reduction: calculate the information gain for each attribute in the original dataset and discard all attributes that do not meet a specified criterion.
Pass the revised dataset to the preferred classification algorithm.
Entropy has a bias towards selecting attributes with a large number of values.
\paragraph{Calculation}
To decide which attribute to split on:
\begin{enumerate}
\item Find the entropy of the data in each of the branches after the split.
\item Take the weighted average of those branch entropies and use it to find the information gain (as shown below).
\item The attribute whose split gives the highest information gain (lowest weighted average entropy) is selected.
\end{enumerate}
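In symbols (standard notation, not from the notes: $E_{\mathrm{start}}$ is the entropy before the split, and branch $j$ receives $N_{j}$ of the $N$ instances and has entropy $E_{j}$):
\begin{displaymath}
E_{\mathrm{new}} = \sum_{j} \frac{N_{j}}{N} E_{j}, \qquad \mathrm{Information\ Gain} = E_{\mathrm{start}} - E_{\mathrm{new}}
\end{displaymath}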
\begin{itemize}
\item Entropy is always positive or zero
\item Entropy is zero when $p_{i}=1$, i.e. when all instances have the same class
\end{itemize}
Although lift is a useful measure of interestingness, it is not always the best one to use.
In some cases a rule with higher support and lower lift can be more interesting than one with lower support and higher lift, because it applies to more cases.