% CSCI577 final study notes (extraction metadata removed)
\documentclass[sigconf,authorversion,nonacm]{acmart}
|
||
|
||
\usepackage{bbold}
|
||
\usepackage{geometry}
|
||
\geometry{margin=0.3in}
|
||
|
||
\nonstopmode
|
||
|
||
\begin{document}
|
||
\title{CSCI577 Final}
|
||
|
||
\maketitle
|
||
|
||
\section{Data Types}
|
||
\begin{table*}
|
||
\caption{data types}
|
||
\begin{tabular}{lll}
|
||
\toprule
|
||
Variable type & Description & Examples \\
|
||
\midrule
|
||
Categorical & & \\
|
||
Nominal (unordered) & Gives only qualitative information & Names, occupations, nationalities, sex, religion\\
|
||
Ordinal (ordered) & Ranking or order is important & Social status, economic class \\
|
||
Numeric & & \\
|
||
Interval & Distance between values has meaning (discrete or continuous) & Year, temperature \\
|
||
Ratio & Ratio of two values has meaning & Wealth, age, prices, wages \\
|
||
\bottomrule
|
||
\end{tabular}
|
||
\end{table*}
|
||
|
||
\section{Math}
|
||
|
||
\begin{displaymath}
|
||
\log_{b}a = \frac{\log_{x}a}{\log_{x}b}
|
||
\end{displaymath}
|
||
|
||
\section{Classification}
|
||
\paragraph{Definition}
|
||
Constructing a method of classifying new instances using information in a training set
|
||
|
||
\begin{itemize}
|
||
\item Naive Bayes (Conditional Probabilities)
|
||
\item Decision Trees
|
||
\item Logistic Regression
|
||
\item Neural Networks
|
||
\end{itemize}
|
||
|
||
\subsection{Naïve Bayes}
|
||
Bayes Theorem:
|
||
\begin{displaymath}
|
||
P(A|B) = \frac{P(A,B)}{P(B)} = \frac{P(B|A) \times P(A)}{P(B)}
|
||
\end{displaymath}
|
||
|
||
\paragraph{Calculation}
|
||
Probability of a class given attributes is the product of the probability of that class overall with the product of the probabilities of each individual attribute value given the class:
|
||
\begin{displaymath}
|
||
P(c_{i} | v) = P(c_{i}) \prod_{j=1}^{n} P(a_{j} = v_{j} | \text{class} = c_{i})
|
||
\end{displaymath}
|
||
|
||
|
||
\subsection{TDIDT}
|
||
\paragraph{Definition}
|
||
Top-Down Induction of Decision Trees
|
||
|
||
\paragraph{Algorithm}
|
||
Until no more splitting is possible:
|
||
\begin{itemize}
|
||
\item IF all the instances in the training set belong to the same class THEN return the value of the class
|
||
\item ELSE
|
||
\begin{enumerate}
|
||
\item Select an attribute A to split on
|
||
\item Sort the instances in the training set into subsets, one for each value of attribute A
|
||
\item Return a tree with one branch for each non-empty subset
|
||
\begin{itemize}
|
||
\item Each branch having a descendant subtree or a class value produced by applying the algorithm recursively
|
||
\end{itemize}
|
||
\end{enumerate}
|
||
\end{itemize}
|
||
|
||
|
||
\subsection{Adequacy}
|
||
\paragraph{Definition}
|
||
No two instances with the same values of all the attributes may belong to different classes.
|
||
Naive Bayes can still be used when this condition doesn't hold, as it will still be able to obtain the probabilities of each class.
|
||
kNN can still be used, since multiple datapoints at the same location in Euclidean space would still function as expected.
|
||
|
||
\subsection{Overfitting}
|
||
Understand the concept of overfitting and be able to tell how you would know that a classification system overfit
|
||
\paragraph{Definition}
|
||
If classifier generates a decision tree (or other mechanism) too well adapted to the training set
|
||
Performs well on training set, not well on other data.
|
||
Some overfitting inevitable.
|
||
|
||
Remedy:
|
||
\begin{itemize}
|
||
\item Adjust a decision tree while it is being generated: Pre-pruning
|
||
\item Modify the tree after creation: Post-pruning
|
||
\end{itemize}
|
||
|
||
\subsubsection{Clashes}
|
||
Two (or more) instances of a training set have identical attribute values, but different classification.
|
||
Especially a problem for TDIDT's 'Adequacy condition'.
|
||
|
||
\paragraph{Stems from}
|
||
\begin{itemize}
|
||
\item Classification incorrectly recorded
|
||
\item Recorded attributes insufficient - Would need more attributes, normally impossible
|
||
\end{itemize}
|
||
|
||
\paragraph{Solutions}
|
||
\begin{itemize}
|
||
\item Discard the branch to the clashing node from the node above
|
||
\item Of clashing instances, assume the majority label
|
||
\end{itemize}
|
||
|
||
\subsubsection{Prepruning}
|
||
When pre-pruning, may reduce accuracy on training set but may be better on test set (and subsequent) data than unpruned classifier.
|
||
\begin{enumerate}
|
||
\item test whether a termination condition applies.
|
||
\begin{itemize}
|
||
\item If so, current subset is treated as a 'clash set'
|
||
\item Resolve by 'delete branch,' 'majority voting,' etc.
|
||
\end{itemize}
|
||
\item two methods:
|
||
\begin{itemize}
|
||
\item Size cutoff -- prune if the subset has fewer than X instances
|
||
\item Maximum depth: prune if the length of the branch exceeds Y
|
||
\end{itemize}
|
||
\end{enumerate}
|
||
|
||
\subsubsection{PostPruning}
|
||
\begin{enumerate}
|
||
\item look for non-leaf nodes whose descendant subtrees have depth 1.
|
||
\item In this tree, only node G and D are candidates for pruning (consolidation).
|
||
\end{enumerate}
|
||
|
||
|
||
\subsection{Discretizing}
|
||
|
||
\subsubsection{Equal Width Intervals}
|
||
\subsubsection{Pseudo Attributes}
|
||
\subsubsection{Processing Sorted Instance Table}
|
||
\subsubsection{ChiMerge}
|
||
|
||
\paragraph{Rationalization}
|
||
Initially, each distinct value of a numerical attribute $A$ is considered to be one interval.
|
||
$\chi^{2}$ tests are performed for every pair of adjacent intervals.
|
||
Adjacent intervals with the least $\chi^{2}$ values are merged together, because $\chi^{2}$ low values for a pair indicates similar class distributions.
|
||
This merging process proceeds recursively until a predefined stopping criterion is met.
|
||
For two adjacent intervals, if the $\chi^{2}$ test concludes that the class is independent of the interval, the intervals should be merged.
|
||
If $\chi^{2}$ test concludes that they are not independent, i.e. the difference in relative class frequency is statistically significant, the two intervals should remain separate.
|
||
|
||
\paragraph{Calculation}
|
||
To calculate expected value for any combination of row and class:
|
||
\begin{enumerate}
|
||
\item Take the product of the corresponding row sum and column sum
|
||
\item Divide by the grand total of the observed values for the two rows.
|
||
\end{enumerate}
|
||
|
||
Then:
|
||
\begin{enumerate}
|
||
\item Using observed and expected values, calculate, for each of the cells:
|
||
\begin{math}
|
||
\frac{(O - E)^{2}}{E}
|
||
\end{math}
|
||
\item Sum each cell's $\chi^{2}$
|
||
\end{enumerate}
|
||
When the sum exceeds the $\chi^{2}$ threshold, the hypothesis is rejected.
|
||
A small value supports the hypothesis.
|
||
Important adjustment, when $E < 0.5$ replace it with $0.5$.
|
||
\begin{enumerate}
|
||
\item Select the smallest value
|
||
\item Compare it to the threshold
|
||
\item If it falls below the threshold, merge it with the row immediately below it
|
||
\item recalculate $\chi^{2}$, Only need to do this for rows adjacent to the recently merged one.
|
||
\end{enumerate}
|
||
|
||
Large numbers of intervals do little to solve the problem of discretization.
|
||
Just one interval cannot contribute to a decision making process.
|
||
Modify the significance level that the hypothesis of independence must pass in order to trigger an interval merge.
|
||
Set a minimum and a maximum number of intervals
|
||
|
||
\subsection{Entropy}
|
||
\paragraph{Definition}
|
||
Entropy is a measure of the uncertainty arising from there being more than one possible classification.
|
||
Used for splitting attributes in decision trees
|
||
Using entropy minimizes the complexity (number of branches) of the decision tree.
|
||
No guarantee that using entropy will always lead to a small decision tree.
|
||
Used for feature reduction: Calculate the value of information gain for each attribute in the original dataset. Discard all attributes that do not meet a specified criterion.
|
||
Pass the revised dataset to the preferred classification algorithm
|
||
|
||
Entropy has bias towards selecting attributes with a large number of values
|
||
\paragraph{Calculation}
|
||
To decide if you split on an attribute:
|
||
\begin{enumerate}
|
||
\item find the entropy of the data in each of the branches after the split
|
||
\item then take the average of those and use it to find information gain.
|
||
\item The attribute split with the highest information gain (lowest entropy) is selected.
|
||
\end{enumerate}
|
||
|
||
\begin{itemize}
|
||
\item Entropy is always positive or zero
|
||
\item Entropy is zero when $p_{i} = 1$, aka when all instances have the same class
|
||
\item Entropy is at its max value for the \# of classes when all classes are evenly distributed
|
||
\end{itemize}
|
||
|
||
If there are $K$ classes, we can denote the proportion of instances with classification $i$ by $p_{i}$ for $i = 1, \dots, K$.
|
||
$p_{i} = \frac{\text{instances of class } i}{\text{total number of instances}}$
|
||
|
||
\begin{displaymath}
|
||
\text{Entropy} = E = -\sum_{i=1}^{K} p_{i} \log_{2} p_{i}
|
||
\end{displaymath}
|
||
|
||
where $K =$ non-empty classes and $p_{i} = \frac{\lvert i \rvert}{N}$, instances in class $i$ over total number of instances $N$.
|
||
|
||
|
||
\subsection{GINI}
|
||
|
||
\paragraph{Calculation}
|
||
\begin{enumerate}
|
||
\item For each non-empty column, form the sum of the squares of the values in the body of the table and divide by the column sum.
|
||
\item Add the values obtained for all the columns and divide by N (the number of instances).
|
||
\item Subtract the total from 1.
|
||
\end{enumerate}
|
||
|
||
\subsection{Information Gain}
|
||
\paragraph{Definition}
|
||
The difference between the entropy before and after splitting on a given attribute in a decision tree.
|
||
Maximizing information gain is the same as minimizing $E_{\text{new}}$.
|
||
\paragraph{Calculation}
|
||
|
||
\begin{displaymath}
|
||
\text{Information Gain} = E_{\text{start}} - E_{\text{new}}
|
||
\end{displaymath}
|
||
|
||
Starting node:
|
||
|
||
\begin{align}
E_{\text{start}} = {} & -\frac{4}{24}\log_{2}\frac{4}{24} \nonumber \\
& -\frac{5}{24}\log_{2}\frac{5}{24} \nonumber \\
& -\frac{15}{24}\log_{2}\frac{15}{24}
\end{align}
|
||
|
||
After splitting on the attribute:
|
||
|
||
\begin{align}
E_{\text{new}} = {} & \frac{8}{24}E_{1} \nonumber \\
& + \frac{8}{24}E_{2} \nonumber \\
& + \frac{8}{24}E_{3}
\end{align}
|
||
|
||
|
||
\paragraph{Uses}
|
||
|
||
|
||
\section{Clustering}
|
||
\paragraph{Definition}
|
||
Grouping data into separate groups.
|
||
Use distance metric between two datapoints.
|
||
Groups should be distinct from one another, composed of items similar to one another and different from items in other groups.
|
||
|
||
\subsection{Nearest Neighbors}
|
||
Mainly used when all attribute values are continuous
|
||
|
||
General strategy:
|
||
\begin{enumerate}
|
||
\item Find the $k$ training instances that are closest to the unseen instance.
|
||
\item Take the most commonly occurring classification for these instances.
|
||
\end{enumerate}
|
||
|
||
\begin{itemize}
|
||
\item KMeans
|
||
\item DBSCAN
|
||
\end{itemize}
|
||
|
||
|
||
\section{Sequence Mining}
|
||
\textbf{TODO}
|
||
\paragraph{Definition}
|
||
Finding meaningful, recurring sequences of events
|
||
|
||
\begin{itemize}
|
||
\item A sequence is an ordered list of elements (transactions):
|
||
\begin{displaymath}
|
||
s = \langle e_1\, e_2\, e_3 \rangle
|
||
\end{displaymath}
|
||
\item Each element contains a collection of events (items):
|
||
\begin{displaymath}
|
||
e_i = \{ i_1\, i_2\, i_3 \cdots i_k \}
|
||
\end{displaymath}
|
||
\item Each element is attributed to a specific time or location.
|
||
\item Length of a sequence, $\lvert s \rvert$, is given by the number of elements of the sequence.
|
||
\item A k-sequence is a sequence that contains k events (items)
|
||
\end{itemize}
|
||
|
||
\paragraph{Contains}
|
||
A sequence \begin{math} \langle a_1 a_2 \cdots a_n \rangle \end{math}
|
||
is contained in another sequence \begin{math} \langle b_1 b_2 \cdots b_m \rangle; (m \geq n)\end{math}
|
||
if there exist integers \begin{math} i_1 < i_2 < \cdots < i_n \end{math}
|
||
such that \begin{math} a_1 \subseteq b_{i_1}, a_2 \subseteq b_{i_2}, \cdots, a_n \subseteq b_{i_n} \end{math}.
|
||
|
||
\paragraph{Support}
|
||
The support of a subsequence w is defined as the fraction of data sequences that contain w.
|
||
A sequential pattern is a frequent subsequence (i.e., a subsequence where $\text{support} \geq \text{minsup}$)
|
||
|
||
\subsection{Generalized Sequential Pattern}
|
||
\begin{enumerate}
|
||
\item Make the first pass over the sequence database D to yield all the 1-element frequent sequences
|
||
\item Repeat until no new frequent sequences are found:
|
||
\begin{enumerate}
|
||
\item Candidate Generation: Merge pairs of frequent subsequences found in the $(k-1)$'th pass to generate candidate sequences that contain $k$ items
|
||
\item Initial Pruning: Prune if it is not the case that all of the $k-1$ subsequences of a $k$ sequence are frequent
|
||
\item Support Counting: Make a new pass over the sequence database $D$ to find the support for these candidate sequences
|
||
\item Candidate Elimination: Eliminate candidate k-sequences whose actual support is less than minsup
|
||
\end{enumerate}
|
||
\end{enumerate}
|
||
|
||
\subsection{Counting Methods}
|
||
\begin{itemize}
|
||
\item COBJ: One occurrence per object
|
||
\item CWIN: One occurrence per sliding window
|
||
\item CMINWIN: Number of minimal windows of occurrence
|
||
\item CDIST 0: Distinct occurrences with possibility of event-timestamp overlap
|
||
\item CDIST: Distinct occurrences with no event-timestamp overlap allowed
|
||
\end{itemize}
|
||
|
||
|
||
\section{Association Rule Analysis}
|
||
\paragraph{Definition}
|
||
Given a collection of collections (database of transactions of food items), find items with high co-occurrence.
|
||
|
||
Let $m$ be the number of possible items that can be bought.
|
||
Let $I$ denote the set of all possible items.
|
||
Possible itemsets: $2^{\lvert I \rvert}$
|
||
An itemset $S$ matches a transaction $T$ (itself an itemset) if $S \subseteq T$.
|
||
|
||
\subsection{Support}
|
||
|
||
\paragraph{Definition}
|
||
$support(S)$: proportion of transactions matched by $S$.
|
||
Proportion of transactions that contain all the items in $S$.
|
||
Frequency with which the items in S occur together in the database.
|
||
|
||
\paragraph{Calculation}
|
||
\begin{displaymath}
|
||
\text{support}(S) = \frac{count(S)}{n}
|
||
\end{displaymath}
|
||
where n is the number of transactions in the database.
|
||
|
||
|
||
\subsection{APRIORI}
|
||
|
||
\paragraph{Pseudo-code}
|
||
\begin{enumerate}
|
||
\item Create $L_{1} =$ set of supported itemsets of cardinality one.
|
||
\item Set $k = 2$.
|
||
\item while $(L_{k-1} \neq \emptyset)$.
|
||
\begin{enumerate}
|
||
\item Create $C_{k}$ from $L_{k-1}$.
|
||
\item Prune all the itemsets in $C_{k}$ that are not supported, to create $L_{k}$.
|
||
\item Increase $k$ by $1$.
|
||
\end{enumerate}
|
||
\item The set of all supported itemsets is $L_1 \cup L_2 \cup \cdots \cup L_k$.
|
||
\end{enumerate}
|
||
|
||
To start the process we construct $C_1$.
|
||
\begin{enumerate}
|
||
\item Set of all itemsets comprising just a single item,
|
||
\item Make a pass through the database counting the number of transactions that match each of these itemsets.
|
||
\item Divide these counts by the number of transactions in the database
|
||
\item Checking for minsup each single-element itemset.
|
||
\item Discard all those with $\text{support} < minsup$ to yield $L_1$.
|
||
\item Continue until $L_{k}$ is empty.
|
||
\end{enumerate}
|
||
|
||
|
||
\subsection{Confidence}
|
||
|
||
\paragraph{Calculation}
|
||
Confidence of a rule can be calculated either by
|
||
\begin{displaymath}
|
||
Confidence(L \rightarrow R) = \frac{count(L \cup R)}{count(L)}
|
||
\end{displaymath}
|
||
or
|
||
\begin{displaymath}
|
||
Confidence(L \rightarrow R) = \frac{support(L \cup R)}{support(L)}
|
||
\end{displaymath}
|
||
Reject rules where
|
||
\begin{displaymath}
|
||
support < minsup \approx 0.01 = 1\%
|
||
\end{displaymath}
|
||
Also called a frequent/large/supported itemset.
|
||
|
||
Reject rules where
|
||
\begin{displaymath}
|
||
confidence < minconf \approx 0.8 = 80\%
|
||
\end{displaymath}
|
||
|
||
\paragraph{Uses}
|
||
|
||
\subsection{Lift}
|
||
\paragraph{Definition}
|
||
Lift measures how many more times the items in $L$ and $R$ occur together than would be expected if they were statistically independent.
|
||
Although lift is a useful measure of interestingness it is not always the best one to use.
|
||
In some cases a rule with higher support and lower lift can be more interesting than one with lower support and higher lift because it applies to more cases.
|
||
\paragraph{Calculation}
|
||
|
||
\begin{eqnarray}
|
||
\text{Lift}(L \rightarrow R)
|
||
= \frac{\text{count}(L \cup R)}{\text{count}(L) \times \text{support}(R)} \nonumber
|
||
\\
|
||
= \frac{\text{support}(L \cup R)}{\text{support}(L) \times \text{support}(R)} \nonumber
|
||
\\
|
||
= \frac{\text{confidence}(L \rightarrow R)}{\text{support}(R)} \nonumber
|
||
\\
|
||
= \frac{N \times \text{confidence}(L \rightarrow R)}{\text{count}(R)} \nonumber
|
||
\\
|
||
= \frac{N \times \text{confidence}(R \rightarrow L)}{\text{count}(R)} \nonumber
|
||
\\
|
||
= \text{Lift}(R \rightarrow L)
|
||
\end{eqnarray}
|
||
|
||
\subsection{Leverage}
|
||
\paragraph{Calculation}
|
||
\begin{displaymath}
|
||
\text{leverage}(L \rightarrow R) = \text{support}(L \cup R) - \text{support}(L) \times \text{support}(R)
|
||
\end{displaymath}
|
||
|
||
\subsection{Frequent Itemsets}
|
||
\begin{enumerate}
|
||
\item Find itemsets of size $k$ made from 2 supported itemsets of size $k-1$
|
||
\item For each new itemset:
|
||
\begin{enumerate}
|
||
\item check if every sub-itemset in it also exists in the supported itemsets of size $k - 1$.
|
||
\item If not every sub-itemset does, then prune it
|
||
\end{enumerate}
|
||
\item Now with the final candidates, determine if they have mininum support
|
||
\item To determine association rules, find which itemsets have at least minimum confidence
|
||
\end{enumerate}
|
||
|
||
|
||
\subsection{Rules Possible}
|
||
|
||
The number of ways of selecting $i$ items from the $k$ in a supported itemset of cardinality $k$ for the right-hand side of a rule is given by:
|
||
\begin{displaymath}
|
||
\binom{k}{i}
|
||
\end{displaymath}
|
||
|
||
Total number of rules:
|
||
|
||
\begin{displaymath}
|
||
\sum_{i=1}^{k-1} \binom{k}{i}
|
||
\end{displaymath}
|
||
|
||
\begin{displaymath}
|
||
2^{k} - 2
|
||
\end{displaymath}
|
||
|
||
If there are 800 supported itemsets in $L_1$, then $C_2$ contains $800 \times \frac{799}{2}$ candidate itemsets.
|
||
|
||
\end{document}
|
||
\endinput
|