\documentclass[sigconf,authorversion,nonacm]{acmart}
\usepackage{bbold}
\usepackage{geometry}
\geometry{margin=0.3in}

\nonstopmode

\begin{document}
\title{CSCI577 Final}

\maketitle

\section{Classification}
\paragraph{Definition}
Constructing a method of classifying new instances using information in a training set

\begin{itemize}
\item Naive Bayes (Conditional Probabilities)
\item Decision Trees
\item Logistic Regression
\item Neural Networks
\end{itemize}

\subsection{TDIDT}
\paragraph{Definition}
Top-Down Induction of Decision Trees

\subsection{Adequacy}
\paragraph{Definition}
No two instances with the same values of all the attributes may belong to different classes.
Naive Bayes can still be used when this condition does not hold, as it still yields a probability for each class.
kNN can still be used, since multiple datapoints at the same location in Euclidean space still behave as expected under a distance-based majority vote.

\paragraph{Application}
The TDIDT algorithm is negatively affected.
\paragraph{Algorithm}
Until no more splitting is possible (a sketch in Python follows the list):
\begin{itemize}
\item IF all the instances in the training set belong to the same class THEN return the value of the class
\item ELSE
\begin{enumerate}
\item Select an attribute A to split on
\item Sort the instances in the training set into subsets, one for each value of attribute A
\item Return a tree with one branch for each non-empty subset
\begin{itemize}
\item Each branch having a descendant subtree or a class value produced by applying the algorithm recursively
\end{itemize}
\end{enumerate}
\end{itemize}
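
A minimal Python sketch of the algorithm above (my own illustration, not from the course notes; \texttt{select\_attribute} is assumed to be supplied, e.g.\ choosing the attribute with maximum information gain):
\begin{verbatim}
# Hypothetical TDIDT sketch; instances are
# (attribute_dict, label) pairs.
def tdidt(instances, attributes, select_attribute):
    labels = {label for _, label in instances}
    if len(labels) == 1:      # all instances share one class
        return labels.pop()   # leaf: return the class value
    if not attributes:        # clash set: majority voting
        return max(labels, key=lambda c:
                   sum(1 for _, l in instances if l == c))
    a = select_attribute(instances, attributes)
    tree = {}
    for value in {inst[a] for inst, _ in instances}:
        # one branch per non-empty subset, built recursively
        subset = [(i, l) for i, l in instances if i[a] == value]
        tree[(a, value)] = tdidt(
            subset, [x for x in attributes if x != a],
            select_attribute)
    return tree
\end{verbatim}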

\subsection{Overfitting}
Understand the concept of overfitting and be able to tell how you would know that a classification system has overfit.
\paragraph{Definition}
The classifier generates a decision tree (or other mechanism) that is too well adapted to the training set.
It performs well on the training set, but not well on other data.
Some overfitting is inevitable.

Remedy:
\begin{itemize}
\item Adjust a decision tree while it is being generated: Pre-pruning
\item Modify the tree after creation: Post-pruning
\end{itemize}

\subsubsection{Clashes}
Two (or more) instances of a training set have identical attribute values, but different classifications.
Especially a problem for TDIDT's 'Adequacy condition'.

\paragraph{Stems from}
\begin{itemize}
\item Classification incorrectly recorded
\item Recorded attributes insufficient: more attributes would be needed, which is normally impossible
\end{itemize}

\paragraph{Solutions}
\begin{itemize}
\item Discard the branch to the clashing node from the node above
\item Of the clashing instances, assume the majority label
\end{itemize}

\subsubsection{Pre-pruning}
Pre-pruning may reduce accuracy on the training set, but the pruned classifier may perform better on the test set (and subsequent data) than the unpruned one.
\begin{enumerate}
\item Test whether a termination condition applies.
\begin{itemize}
\item If so, the current subset is treated as a 'clash set'
\item Resolve by 'delete branch,' 'majority voting,' etc.
\end{itemize}
\item Two termination conditions (a small sketch follows the list):
\begin{itemize}
\item Size cutoff: prune if the subset has fewer than X instances
\item Maximum depth: prune if the length of the branch exceeds Y
\end{itemize}
\end{enumerate}
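
A tiny sketch of the two cutoffs as a termination test (my own illustration; the thresholds X and Y appear as the hypothetical parameters \texttt{min\_size} and \texttt{max\_depth}):
\begin{verbatim}
def should_preprune(subset, depth, min_size=5, max_depth=4):
    """True if the subset should be treated as a clash set."""
    return len(subset) < min_size or depth > max_depth
\end{verbatim}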

\subsubsection{Post-pruning}
\begin{enumerate}
\item Look for non-leaf nodes whose descendant branches all have length 1 (i.e. all of their children are leaf nodes).
\item In the example tree, only nodes G and D are candidates for pruning (consolidation).
\end{enumerate}

\subsection{Discretizing}

\subsubsection{Equal Width Intervals}
\subsubsection{Pseudo Attributes}
\subsubsection{Processing Sorted Instance Table}
\subsubsection{ChiMerge}

\paragraph{Rationalization}
Initially, each distinct value of a numerical attribute $A$ is considered to be one interval.
$\chi^{2}$ tests are performed for every pair of adjacent intervals.
Adjacent intervals with the least $\chi^{2}$ value are merged together, because a low $\chi^{2}$ value for a pair indicates similar class distributions.
This merging process proceeds recursively until a predefined stopping criterion is met.
For two adjacent intervals, if the $\chi^{2}$ test concludes that the class is independent of the interval, the intervals should be merged.
If the $\chi^{2}$ test concludes that they are not independent, i.e. the difference in relative class frequency is statistically significant, the two intervals should remain separate.
\paragraph{Calculation}
To calculate the expected value for any combination of row and class:
\begin{enumerate}
\item Take the product of the corresponding row sum and column sum
\item Divide by the grand total of the observed values for the two rows
\end{enumerate}

Then:
\begin{enumerate}
\item Using observed and expected values, calculate, for each of the cells,
\begin{math}
\frac{(O - E)^{2}}{E}
\end{math}
\item Sum the values over all cells to obtain the pair's $\chi^{2}$
\end{enumerate}
When the sum exceeds the $\chi^{2}$ threshold, the hypothesis of independence is rejected.
A small value supports the hypothesis.
Important adjustment: when $E < 0.5$, replace it with $0.5$.

Merging procedure (a sketch follows the list):
\begin{enumerate}
\item Select the smallest $\chi^{2}$ value
\item Compare it to the threshold
\item If it falls below the threshold, merge the interval with the row immediately below it
\item Recalculate $\chi^{2}$; this is only needed for rows adjacent to the recently merged one
\end{enumerate}
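
A sketch of the $\chi^{2}$ computation for one pair of adjacent intervals, following the steps above (my own illustration; each interval is given as its list of observed class counts):
\begin{verbatim}
def chi2_adjacent(row_a, row_b):
    """Chi-square for a pair of adjacent intervals."""
    rows = [row_a, row_b]
    col_sums = [sum(col) for col in zip(*rows)]
    row_sums = [sum(row) for row in rows]
    total = sum(row_sums)     # grand total over the two rows
    chi2 = 0.0
    for row, row_sum in zip(rows, row_sums):
        for observed, col_sum in zip(row, col_sums):
            expected = row_sum * col_sum / total
            expected = max(expected, 0.5)  # adjust when E < 0.5
            chi2 += (observed - expected) ** 2 / expected
    return chi2
\end{verbatim}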

A large number of intervals does little to solve the problem of discretization.
An attribute discretized into just one interval cannot contribute to the decision-making process.
Modify the significance level that the hypothesis of independence must pass before an interval merge is triggered.
Set a minimum and a maximum number of intervals.

\subsection{Entropy}
\paragraph{Definition}
Entropy is a measure of the uncertainty that arises when more than one classification is possible.
Used for selecting the attribute to split on in decision trees.
Splitting on minimum entropy tends to minimize the complexity (number of branches) of the decision tree.
There is no guarantee that using entropy will always lead to a small decision tree.
Used for feature reduction: calculate the value of information gain for each attribute in the original dataset, discard all attributes that do not meet a specified criterion, and pass the revised dataset to the preferred classification algorithm.

Entropy has a bias towards selecting attributes with a large number of values.
\paragraph{Calculation}
To decide which attribute to split on:
\begin{enumerate}
\item Find the entropy of the data in each of the branches after the split
\item Take the weighted average of those values and use it to find the information gain
\item The attribute whose split gives the highest information gain (lowest new entropy) is selected
\end{enumerate}

\begin{itemize}
\item Entropy is always positive or zero
\item Entropy is zero when some $p_{i} = 1$, i.e. when all instances have the same class
\item Entropy is at its maximum value, $\log_{2} K$ for $K$ classes, when all classes are evenly distributed
\end{itemize}

If there are $K$ classes, we can denote the proportion of instances with classification $i$ by $p_{i}$ for $i = 1, \dots, K$:
$p_{i} = \frac{\text{instances of class } i}{\text{total number of instances}}$

\begin{displaymath}
\text{Entropy} = E = -\sum_{i=1}^{K} p_{i} \log_{2} p_{i}
\end{displaymath}

where $K$ is the number of non-empty classes and $p_{i} = \frac{\lvert i \rvert}{N}$, the number of instances in class $i$ over the total number of instances $N$.
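
A short sketch of the entropy formula (my own illustration; \texttt{counts} holds the number of instances in each non-empty class):
\begin{verbatim}
import math

def entropy(counts):
    """E = -sum of p_i * log2(p_i) over non-empty classes."""
    n = sum(counts)
    return -sum((c / n) * math.log2(c / n)
                for c in counts if c > 0)

# entropy([12]) == 0.0  (a single class)
# entropy([6, 6]) == 1.0 (two evenly distributed classes)
\end{verbatim}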

\subsection{GINI}

\paragraph{Calculation}
From the frequency table for the split (a sketch follows the list):
\begin{enumerate}
\item For each non-empty column, form the sum of the squares of the values in the body of the table and divide by the column sum.
\item Add the values obtained for all the columns and divide by $N$ (the number of instances).
\item Subtract the total from 1.
\end{enumerate}
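
A sketch of the three steps above, assuming the table is laid out as one list of class counts per column (my own illustration):
\begin{verbatim}
def gini_after_split(columns, n):
    """columns: per-column class counts; n: total instances."""
    total = 0.0
    for col in columns:
        col_sum = sum(col)
        if col_sum == 0:
            continue                               # skip empty columns
        total += sum(v * v for v in col) / col_sum  # step 1
    return 1 - total / n                            # steps 2 and 3
\end{verbatim}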

\subsection{Information Gain}
\paragraph{Definition}
The difference between the entropy before and after splitting on a given attribute in a decision tree.
Maximizing information gain is the same as minimizing $E_{\text{new}}$.
\paragraph{Calculation}

\begin{displaymath}
\text{Information Gain} = E_{\text{start}} - E_{\text{new}}
\end{displaymath}

Starting node (example with class counts 4, 5 and 15 out of 24 instances):

\begin{eqnarray}
E_{\text{start}} = -\frac{4}{24}\log_{2}\frac{4}{24} \nonumber \\
-\frac{5}{24}\log_{2}\frac{5}{24} \nonumber \\
-\frac{15}{24}\log_{2}\frac{15}{24}
\end{eqnarray}

After splitting on an attribute (three subsets of 8 instances each):

\begin{eqnarray}
E_{\text{new}} = \frac{8}{24}E_{1} \nonumber \\
+ \frac{8}{24}E_{2} \nonumber \\
+ \frac{8}{24}E_{3}
\end{eqnarray}
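
A small sketch that mirrors the example above (my own illustration; the per-subset entropies $E_{1}, E_{2}, E_{3}$ come from the same entropy formula, and each subset's weight is its share of the instances, $\frac{8}{24}$ in the example):
\begin{verbatim}
import math

def entropy(counts):
    n = sum(counts)
    return -sum((c / n) * math.log2(c / n)
                for c in counts if c > 0)

def information_gain(start_counts, subset_counts):
    """E_start minus the weighted average of subset entropies."""
    n = sum(start_counts)
    e_new = sum(sum(s) / n * entropy(s) for s in subset_counts)
    return entropy(start_counts) - e_new
\end{verbatim}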

\paragraph{Uses}

\section{Clustering}
\paragraph{Definition}
Grouping data into separate groups.
Uses a distance metric between two datapoints.
Groups should be distinct from one another and composed of items that are similar to one another and different from items in other groups.

\subsection{Naïve Bayes}
\begin{displaymath}
P(c_{i} | v) = P(c_{i}) \prod_{j=1}^{n} P(a_{j} = v_{j} | \text{class} = c_{i})
\end{displaymath}
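
A sketch of scoring one unseen instance with this product (my own illustration; the hypothetical \texttt{priors} holds each $P(c_{i})$ and \texttt{cond} holds the estimated $P(a_{j} = v_{j} \mid c_{i})$ tables):
\begin{verbatim}
def naive_bayes_classify(v, priors, cond):
    """Class with the largest P(c_i) * prod P(a_j=v_j | c_i)."""
    scores = {}
    for c, prior in priors.items():
        score = prior
        for j, value in enumerate(v):
            score *= cond[c][j].get(value, 0.0)  # P(a_j=v_j | c_i)
        scores[c] = score
    return max(scores, key=scores.get)
\end{verbatim}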

\subsection{Nearest Neighbors}
Mainly used when all attribute values are continuous.

General strategy (a sketch follows the list):
\begin{enumerate}
\item Find the $k$ training instances that are closest to the unseen instance.
\item Take the most commonly occurring classification for these instances.
\end{enumerate}
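
A minimal sketch of the two steps for continuous attributes, using Euclidean distance (my own illustration):
\begin{verbatim}
import math
from collections import Counter

def knn_classify(unseen, training, k=3):
    """training: list of (attribute_vector, label) pairs."""
    def dist(a, b):
        return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
    nearest = sorted(training,
                     key=lambda t: dist(unseen, t[0]))[:k]  # step 1
    votes = Counter(label for _, label in nearest)          # step 2
    return votes.most_common(1)[0][0]
\end{verbatim}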

\begin{itemize}
\item KMeans
\item DBSCAN
\end{itemize}

\section{Sequence Mining}
\textbf{TODO}
\paragraph{Definition}
Finding meaningful, recurring sequences of events

\section{Association Rule Analysis}
\paragraph{Definition}
Given a collection of collections (a database of transactions, e.g. of food items), find items with high co-occurrence.

Let $m$ be the number of possible items that can be bought.
Let $I$ denote the set of all possible items.
Possible itemsets: $2^{\lvert I \rvert}$
An itemset $S$ matches a transaction $T$ (itself an itemset) if $S \subseteq T$.

\subsection{Support}

\paragraph{Definition}
$\text{support}(S)$: the proportion of transactions matched by $S$.
Proportion of transactions that contain all the items in $S$.
The frequency with which the items in $S$ occur together in the database.

\paragraph{Calculation}
\begin{displaymath}
\text{support}(S) = \frac{\text{count}(S)}{n}
\end{displaymath}
where $n$ is the number of transactions in the database.
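
A one-function sketch of this ratio over a database of transactions represented as Python sets (my own illustration):
\begin{verbatim}
def support(S, transactions):
    """Proportion of transactions containing every item in S."""
    return (sum(1 for T in transactions if S <= T)
            / len(transactions))
\end{verbatim}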

\subsubsection{Uses}

\subsection{Confidence}

\paragraph{Calculation}
The confidence of a rule $L \rightarrow R$ can be calculated either by
\begin{displaymath}
\text{Confidence}(L \rightarrow R) = \frac{\text{count}(L \cup R)}{\text{count}(L)}
\end{displaymath}
or
\begin{displaymath}
\text{Confidence}(L \rightarrow R) = \frac{\text{support}(L \cup R)}{\text{support}(L)}
\end{displaymath}
Reject rules where
\begin{displaymath}
\text{support} < \text{minsup} \approx 0.01 = 1\%
\end{displaymath}
An itemset whose support is at least minsup is also called a frequent, large, or supported itemset.

Reject rules where
\begin{displaymath}
\text{confidence} < \text{minconf} \approx 0.8 = 80\%
\end{displaymath}
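
A sketch of the count-based form together with the two rejection thresholds (my own illustration; transactions are Python sets as in the support sketch):
\begin{verbatim}
def count(S, transactions):
    return sum(1 for T in transactions if S <= T)

def confidence(L, R, transactions):
    """count(L u R) / count(L)."""
    return count(L | R, transactions) / count(L, transactions)

def accept_rule(L, R, transactions, minsup=0.01, minconf=0.8):
    n = len(transactions)
    return (count(L | R, transactions) / n >= minsup
            and confidence(L, R, transactions) >= minconf)
\end{verbatim}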

\paragraph{Uses}

\subsection{Lift}
\paragraph{Definition}
Lift measures how many more times the items in $L$ and $R$ occur together than would be expected if they were statistically independent.
\paragraph{Calculation}

\begin{eqnarray}
\text{Lift}(L \rightarrow R)
= \frac{\text{count}(L \cup R)}{\text{count}(L) \times \text{support}(R)} \nonumber
\\
= \frac{\text{support}(L \cup R)}{\text{support}(L) \times \text{support}(R)} \nonumber
\\
= \frac{\text{confidence}(L \rightarrow R)}{\text{support}(R)} \nonumber
\\
= \frac{N \times \text{confidence}(L \rightarrow R)}{\text{count}(R)} \nonumber
\\
= \frac{N \times \text{confidence}(R \rightarrow L)}{\text{count}(L)} \nonumber
\\
= \text{Lift}(R \rightarrow L)
\end{eqnarray}
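
A sketch of the support-based form, which makes the symmetry in $L$ and $R$ explicit (my own illustration):
\begin{verbatim}
def support(S, transactions):
    return (sum(1 for T in transactions if S <= T)
            / len(transactions))

def lift(L, R, transactions):
    """support(L u R) / (support(L) * support(R)) = lift(R, L)."""
    return (support(L | R, transactions)
            / (support(L, transactions) * support(R, transactions)))
\end{verbatim}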

\paragraph{Uses}

\subsection{Frequent Itemsets}
To find the supported (frequent) itemsets of size $k$ from those of size $k-1$ (a sketch of the first two steps follows the list):
\begin{enumerate}
\item Find candidate itemsets of size $k$ made from 2 supported itemsets of size $k-1$
\item For each new itemset:
\begin{enumerate}
\item Check whether every sub-itemset in it also exists in the supported itemsets of size $k - 1$.
\item If not every sub-itemset does, then prune it
\end{enumerate}
\item With the final candidates, determine whether they have minimum support
\item To determine association rules, find which rules have at least minimum confidence
\end{enumerate}
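
A sketch of the generate-and-prune step (my own illustration; the hypothetical \texttt{prev} is the set of supported itemsets of size $k-1$, stored as frozensets):
\begin{verbatim}
from itertools import combinations

def candidates(prev, k):
    """Join supported (k-1)-itemsets, then prune candidates."""
    joined = {a | b for a in prev for b in prev if len(a | b) == k}
    return {c for c in joined
            if all(frozenset(s) in prev
                   for s in combinations(c, k - 1))}
\end{verbatim}
The surviving candidates are then checked against minimum support, and the rules built from them against minimum confidence, as in the list above.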

\subsection{Rules Possible}
For a supported itemset of cardinality $k$, the number of possible rules with $i$ items on the left-hand side is

\begin{displaymath}
{}_{k}C_{i} = \binom{k}{i}
\end{displaymath}

and the total number of possible rules is

\begin{displaymath}
2^{k} - 2
\end{displaymath}
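
As a quick check of the counts above (my own worked example, not from the notes): an itemset $\{a, b, c\}$ of cardinality $k = 3$ gives
\begin{displaymath}
\binom{3}{1} + \binom{3}{2} = 3 + 3 = 6 = 2^{3} - 2
\end{displaymath}
rules, namely $a \rightarrow bc$, $b \rightarrow ac$, $c \rightarrow ab$, $ab \rightarrow c$, $ac \rightarrow b$, and $bc \rightarrow a$.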

\end{document}
\endinput