add data types, logs, sequence mining to notecard.
\documentclass[sigconf,authorversion,nonacm]{acmart}

\usepackage{bbold}
\usepackage{geometry}
\geometry{margin=0.3in}

\nonstopmode

\begin{document}

\title{CSCI577 Final}

\maketitle
\section{Data Types}

\begin{table*}
\caption{Data types}
\begin{tabular}{lll}
\toprule
Variable type & Description & Examples \\
\midrule
Categorical & & \\
\quad Nominal (unordered) & Gives only qualitative information & Names, occupations, nationalities, sex, religion \\
\quad Ordinal (ordered) & Ranking or order is important & Social status, economic class \\
Numeric & & \\
\quad Interval & Distance between values has meaning (discrete or continuous) & Year, temperature \\
\quad Ratio & Ratio of two values has meaning & Wealth, age, prices, wages \\
\bottomrule
\end{tabular}
\end{table*}
\section{Math}

Change of base for logarithms:
\begin{displaymath}
\log_{b}a = \frac{\log_{x}a}{\log_{x}b}
\end{displaymath}
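As a quick worked check of the identity (numbers chosen here only for illustration):
\begin{displaymath}
\log_{2}8 = \frac{\log_{10}8}{\log_{10}2} \approx \frac{0.903}{0.301} = 3
\end{displaymath}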
\section{Classification}

\paragraph{Definition}
Constructing a method of classifying new instances using information in a training set.

\begin{itemize}
\item Naive Bayes (conditional probabilities)
\item Decision Trees
\item Logistic Regression
\item Neural Networks
\end{itemize}
\subsection{Naïve Bayes}

Bayes' theorem:
\begin{displaymath}
P(A|B) = \frac{P(A,B)}{P(B)} = \frac{P(B|A) \times P(A)}{P(B)}
\end{displaymath}

\paragraph{Calculation}
The probability of a class given the attribute values is the product of the overall probability of that class with the probabilities of each individual attribute value given the class:
\begin{displaymath}
P(c_{i} \mid v) = P(c_{i}) \prod_{j=1}^{n} P(a_{j} = v_{j} \mid \text{class} = c_{i})
\end{displaymath}
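A minimal sketch of this calculation in Python; the toy training data, attribute names, and labels are made up for illustration and are not from the course materials:

\begin{verbatim}
from collections import Counter, defaultdict

# toy categorical training set: (attribute dict, class label)
train = [
    ({"outlook": "sunny", "windy": "no"},  "play"),
    ({"outlook": "sunny", "windy": "yes"}, "stay"),
    ({"outlook": "rain",  "windy": "yes"}, "stay"),
    ({"outlook": "rain",  "windy": "no"},  "play"),
]

# prior counts for P(c_i) and conditional counts for P(a_j = v_j | c_i)
prior = Counter(label for _, label in train)
cond = defaultdict(Counter)          # (attribute, class) -> value counts
for attrs, label in train:
    for a, v in attrs.items():
        cond[(a, label)][v] += 1

def score(attrs, label):
    """P(c_i) * prod_j P(a_j = v_j | class = c_i), estimated from counts."""
    p = prior[label] / len(train)
    for a, v in attrs.items():
        p *= cond[(a, label)][v] / prior[label]
    return p

unseen = {"outlook": "sunny", "windy": "no"}
print(max(prior, key=lambda c: score(unseen, c)))   # -> 'play'
\end{verbatim}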
\subsection{TDIDT}

\paragraph{Definition}
Top-Down Induction of Decision Trees.

\paragraph{Algorithm}
Until no more splitting is possible (a code sketch follows the list):
\begin{itemize}
\item IF all the instances in the training set belong to the same class THEN return the value of the class
\item ELSE
\begin{enumerate}
\item Select an attribute A to split on
\item Sort the instances in the training set into subsets, one for each value of attribute A
\item Return a tree with one branch for each non-empty subset
\begin{itemize}
\item Each branch has a descendant subtree or a class value produced by applying the algorithm recursively
\end{itemize}
\end{enumerate}
\end{itemize}
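A compact Python sketch of this recursion; attribute selection is simplified to ``first remaining attribute'' (a real implementation would pick the attribute by entropy or Gini), and the data layout is an assumption made here:

\begin{verbatim}
from collections import Counter

def tdidt(instances, attributes):
    """instances: list of (attribute dict, label); returns a nested dict tree or a label."""
    labels = [lab for _, lab in instances]
    if len(set(labels)) == 1:          # all one class -> leaf
        return labels[0]
    if not attributes:                 # clash set -> majority voting
        return Counter(labels).most_common(1)[0][0]
    a = attributes[0]                  # placeholder attribute selection
    tree = {}
    for value in {attrs[a] for attrs, _ in instances}:
        subset = [(attrs, lab) for attrs, lab in instances if attrs[a] == value]
        tree[(a, value)] = tdidt(subset, attributes[1:])   # one branch per value
    return tree

data = [({"outlook": "sunny"}, "play"), ({"outlook": "rain"}, "stay")]
print(tdidt(data, ["outlook"]))
\end{verbatim}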
\subsection{Adequacy}

\paragraph{Definition}
No two instances with the same values of all the attributes may belong to different classes.

Naive Bayes can still be used when this does not hold, as it can still estimate the probability of each class.

kNN can still be used, since multiple data points at the same location in Euclidean space still behave as expected.
\subsection{Overfitting}

Understand the concept of overfitting and be able to tell how you would know that a classification system has overfit.

\paragraph{Definition}
A classifier generates a decision tree (or other mechanism) that is too well adapted to the training set.

It performs well on the training set, but not well on other data.

Some overfitting is inevitable.

Remedy:
\begin{itemize}
\item Adjust a decision tree while it is being generated: pre-pruning
\item Modify the tree after creation: post-pruning
\end{itemize}
\subsubsection{Clashes}

Two (or more) instances of a training set have identical attribute values but different classifications.

Especially a problem for TDIDT's `adequacy condition'.

\paragraph{Stems from}
\begin{itemize}
\item Classification incorrectly recorded
\item Recorded attributes insufficient: more attributes would be needed, which is normally impossible
\end{itemize}

\paragraph{Solutions}
\begin{itemize}
\item Discard the branch to the clashing node from the node above
\item Assign the clashing instances the majority label
\end{itemize}
\subsubsection{Pre-pruning}

Pre-pruning may reduce accuracy on the training set, but may do better on the test set (and subsequent data) than an unpruned classifier.

\begin{enumerate}
\item Test whether a termination condition applies.
\begin{itemize}
\item If so, the current subset is treated as a `clash set'
\item Resolve by `delete branch', `majority voting', etc.
\end{itemize}
\item Two methods:
\begin{itemize}
\item Size cutoff: prune if the subset has fewer than X instances
\item Maximum depth: prune if the length of the branch exceeds Y
\end{itemize}
\end{enumerate}
\subsubsection{Post-pruning}

\begin{enumerate}
\item Look for non-leaf nodes whose descendant subtrees have depth 1, i.e.\ all of their children are leaves.
\item In the example tree (not shown), only nodes G and D are candidates for pruning (consolidation).
\end{enumerate}
\subsection{Discretizing}

\subsubsection{Equal Width Intervals}

\subsubsection{Pseudo Attributes}

\subsubsection{Processing Sorted Instance Table}

\subsubsection{ChiMerge}
\paragraph{Rationale}
Initially, each distinct value of a numerical attribute $A$ is considered to be one interval.

$\chi^{2}$ tests are performed for every pair of adjacent intervals.

Adjacent intervals with the smallest $\chi^{2}$ values are merged together, because a low $\chi^{2}$ value for a pair indicates similar class distributions.

This merging process proceeds recursively until a predefined stopping criterion is met.

For two adjacent intervals, if the $\chi^{2}$ test concludes that the class is independent of the interval, the intervals should be merged.

If the $\chi^{2}$ test concludes that they are not independent, i.e.\ the difference in relative class frequency is statistically significant, the two intervals should remain separate.
\paragraph{Calculation}
To calculate the expected value for any combination of row and class:
\begin{enumerate}
\item Take the product of the corresponding row sum and column sum
\item Divide by the grand total of the observed values for the two rows
\end{enumerate}

Then:
\begin{enumerate}
\item Using the observed and expected values, calculate, for each cell:
\begin{math}
\frac{(O - E)^{2}}{E}
\end{math}
\item Sum the cells' values to obtain the $\chi^{2}$ for the pair of rows
\end{enumerate}

When the $\chi^{2}$ value exceeds the threshold, the independence hypothesis is rejected.

A small value supports the hypothesis.

Important adjustment: when $E < 0.5$, replace it with $0.5$.
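A tiny worked example with made-up counts, to illustrate the arithmetic: suppose two adjacent rows have class counts $(8, 2)$ and $(4, 6)$, so the row sums are $10$ and $10$, the column sums are $12$ and $8$, and the grand total is $20$. For the first cell, $E = \frac{10 \times 12}{20} = 6$ and its contribution is $\frac{(8-6)^{2}}{6} \approx 0.67$; repeating this for all four cells and summing gives the $\chi^{2}$ value for the pair of rows.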
\begin{enumerate}
\item Select the smallest $\chi^{2}$ value
\item Compare it to the threshold
\item If it falls below the threshold, merge it with the row immediately below it
\item Recalculate $\chi^{2}$; this only needs to be done for rows adjacent to the recently merged one
\end{enumerate}

A large number of intervals does little to solve the problem of discretization.

Just one interval cannot contribute to a decision-making process.

Modify the significance level that the hypothesis of independence must pass to trigger an interval merge.

Set a minimum and a maximum number of intervals.
\subsection{Entropy}

\paragraph{Definition}
Entropy is a measure of the extent to which more than one classification is present.

Used for selecting splitting attributes in decision trees.

Entropy minimizes the complexity (number of branches) of the decision tree.

There is no guarantee that using entropy will always lead to a small decision tree.

Used for feature reduction: calculate the value of information gain for each attribute in the original dataset, discard all attributes that do not meet a specified criterion, and pass the revised dataset to the preferred classification algorithm.

Entropy has a bias towards selecting attributes with a large number of values.

\paragraph{Calculation}
To decide whether to split on an attribute:
\begin{enumerate}
\item Find the entropy of the data in each of the branches after the split
\item Take the weighted average of those entropies and use it to find the information gain
\item Select the attribute split with the highest information gain (lowest new entropy)
\end{enumerate}

\begin{itemize}
\item Entropy is always positive or zero
\item Entropy is zero when some $p_{i} = 1$, i.e.\ when all instances have the same class
\item Entropy is at its maximum value for the number of classes when all classes are evenly distributed
\end{itemize}
If there are $K$ classes, we can denote the proportion of instances with classification $i$ by $p_{i}$ for $i = 1$ to $K$:
$p_{i} = \frac{\text{instances of class } i}{\text{total number of instances}}$

\begin{displaymath}
\text{Entropy} = E = -\sum_{i=1}^{K} p_{i} \log_{2} p_{i}
\end{displaymath}

where $K$ is the number of non-empty classes and $p_{i} = \frac{\lvert i \rvert}{N}$, the number of instances in class $i$ over the total number of instances $N$.
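A quick worked example with illustrative proportions: for two classes with $p_{1} = \frac{3}{4}$ and $p_{2} = \frac{1}{4}$,
\begin{displaymath}
E = -\tfrac{3}{4}\log_{2}\tfrac{3}{4} - \tfrac{1}{4}\log_{2}\tfrac{1}{4} \approx 0.311 + 0.5 = 0.811
\end{displaymath}
which is below the maximum of $1$ bit reached when $p_{1} = p_{2} = \frac{1}{2}$.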
\subsection{GINI}

\paragraph{Calculation}
\begin{enumerate}
\item For each non-empty column, form the sum of the squares of the values in the body of the table and divide by the column sum.
\item Add the values obtained for all the columns and divide by N (the number of instances).
\item Subtract the total from 1 (a compact formula follows this list).
\end{enumerate}
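Written as a single formula, with notation assumed here: $n_{ij}$ is the count of class $i$ under attribute value $j$, $n_{\cdot j}$ is the column sum, and $N$ is the total number of instances,
\begin{displaymath}
\text{Gini}_{\text{split}} = 1 - \frac{1}{N}\sum_{j}\frac{\sum_{i} n_{ij}^{2}}{n_{\cdot j}}
\end{displaymath}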
\subsection{Information Gain}

\paragraph{Definition}
The difference between the entropy before and after splitting on a given attribute in a decision tree.

Maximizing information gain is the same as minimizing $E_{\text{new}}$.

\paragraph{Calculation}

\begin{displaymath}
\text{Information Gain} = E_{\text{start}} - E_{\text{new}}
\end{displaymath}

Starting node:
\begin{eqnarray}
E_{\text{start}} & = & -\frac{4}{24}\log_{2}\frac{4}{24} \nonumber \\
& & {} - \frac{5}{24}\log_{2}\frac{5}{24} \nonumber \\
& & {} - \frac{15}{24}\log_{2}\frac{15}{24}
\end{eqnarray}

After splitting on the attribute (here into three subsets of 8 instances, with entropies $E_{1}$, $E_{2}$, $E_{3}$):

\begin{eqnarray}
E_{\text{new}} & = & \frac{8}{24}E_{1} \nonumber \\
& & {} + \frac{8}{24}E_{2} \nonumber \\
& & {} + \frac{8}{24}E_{3}
\end{eqnarray}

\paragraph{Uses}
\section{Clustering}

\paragraph{Definition}
Grouping data into separate groups.

Uses a distance metric between two data points.

Groups should be distinct from one another and composed of items similar to one another, and different from items in other groups.

\subsection{Nearest Neighbors}

Mainly used when all attribute values are continuous.

General strategy (sketched in code after the list):
\begin{enumerate}
\item Find the $k$ training instances that are closest to the unseen instance.
\item Take the most commonly occurring classification for these instances.
\end{enumerate}
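A minimal Python sketch of this strategy; the points, labels, and the choice of Euclidean distance are illustrative assumptions:

\begin{verbatim}
import math
from collections import Counter

# toy training instances: (continuous feature vector, class label)
train = [((1.0, 1.0), "a"), ((1.2, 0.8), "a"),
         ((5.0, 5.0), "b"), ((5.5, 4.5), "b")]

def knn(query, k=3):
    # 1. find the k training instances closest to the unseen instance
    nearest = sorted(train, key=lambda t: math.dist(t[0], query))[:k]
    # 2. take the most commonly occurring classification among them
    return Counter(label for _, label in nearest).most_common(1)[0][0]

print(knn((1.1, 0.9)))   # -> 'a'
\end{verbatim}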
\begin{itemize}
\item KMeans
\item DBSCAN
\end{itemize}
\section{Sequence Mining}

\textbf{TODO}

\paragraph{Definition}
Finding meaningful, recurring sequences of events.

\begin{itemize}
\item A sequence is an ordered list of elements (transactions):
\begin{displaymath}
s = \langle e_1\, e_2\, e_3 \rangle
\end{displaymath}
\item Each element contains a collection of events (items):
\begin{displaymath}
e_i = \{ i_1\, i_2\, i_3 \cdots i_k \}
\end{displaymath}
\item Each element is attributed to a specific time or location.
\item The length of a sequence, $\lvert s \rvert$, is given by the number of elements in the sequence.
\item A $k$-sequence is a sequence that contains $k$ events (items).
\end{itemize}
\paragraph{Contains}
|
||||||
|
A sequence \begin{math} <a_1 a_2 \cdots a_n> \end{math}
|
||||||
|
is contained in another sequence \begin{math}<b_1 b_2 \cdots b_m>; (m \geq n)\end{math}
|
||||||
|
if there exist integers \begin{math} i_1 < i_2 < \cdots < i_n \end{math}
|
||||||
|
such that \begin{math} a_1 \subseteq b_{i1} a_2 \subseteq b_{i2} \cdots a_n \subseteq b_{in} \end{math}.
|
||||||
|
|
||||||
|
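A small illustrative example (elements made up here): $\langle \{a\}\, \{b, c\} \rangle$ is contained in $\langle \{a, d\}\, \{e\}\, \{b, c, f\} \rangle$, taking $i_1 = 1$ and $i_2 = 3$, since $\{a\} \subseteq \{a, d\}$ and $\{b, c\} \subseteq \{b, c, f\}$.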
\paragraph{Support}
The support of a subsequence $w$ is defined as the fraction of data sequences that contain $w$.

A sequential pattern is a frequent subsequence (i.e.\ a subsequence with $\text{support} \geq \text{minsup}$).
\subsection{Generalized Sequential Pattern (GSP)}

\begin{enumerate}
\item Make the first pass over the sequence database $D$ to yield all the 1-element frequent sequences
\item Repeat until no new frequent sequences are found:
\begin{enumerate}
\item Candidate Generation: merge pairs of frequent subsequences found in the $(k-1)$th pass to generate candidate sequences that contain $k$ items
\item Initial Pruning: prune a candidate unless all of the $(k-1)$-subsequences of the $k$-sequence are frequent
\item Support Counting: make a new pass over the sequence database $D$ to find the support for these candidate sequences
\item Candidate Elimination: eliminate candidate $k$-sequences whose actual support is less than minsup
\end{enumerate}
\end{enumerate}
\subsection{Counting Methods}

\begin{itemize}
\item COBJ: One occurrence per object
\item CWIN: One occurrence per sliding window
\item CMINWIN: Number of minimal windows of occurrence
\item CDIST\_O: Distinct occurrences with the possibility of event-timestamp overlap
\item CDIST: Distinct occurrences with no event-timestamp overlap allowed
\end{itemize}
\section{Association Rule Analysis}

\paragraph{Definition}
Given a collection of collections (a database of transactions of food items), find items with high co-occurrence.

Let $m$ be the number of possible items that can be bought.

Let $I$ denote the set of all possible items.

Possible itemsets: $2^{\lvert I \rvert}$

An itemset $S$ matches a transaction $T$ (itself an itemset) if $S \subseteq T$.
\subsection{Support}

\paragraph{Definition}
$\text{support}(S)$: the proportion of transactions matched by $S$.

The proportion of transactions that contain all the items in $S$.

The frequency with which the items in $S$ occur together in the database.

\paragraph{Calculation}
\begin{displaymath}
\text{support}(S) = \frac{\text{count}(S)}{n}
\end{displaymath}
where $n$ is the number of transactions in the database.
\subsubsection{Uses}

\subsection{Confidence}

\paragraph{Calculation}
The confidence of a rule can be calculated either by
\begin{displaymath}
\text{confidence}(L \rightarrow R) = \frac{\text{count}(L \cup R)}{\text{count}(L)}
\end{displaymath}
or
\begin{displaymath}
\text{confidence}(L \rightarrow R) = \frac{\text{support}(L \cup R)}{\text{support}(L)}
\end{displaymath}

Reject rules where
\begin{displaymath}
\text{support} < \text{minsup} \approx 0.01 = 1\%
\end{displaymath}
Itemsets meeting minsup are also called frequent, large, or supported itemsets.

Reject rules where
\begin{displaymath}
\text{confidence} < \text{minconf} \approx 0.8 = 80\%
\end{displaymath}

\paragraph{Uses}
\subsection{Lift}

\paragraph{Definition}
Lift measures how many more times the items in $L$ and $R$ occur together than would be expected if they were statistically independent.

Although lift is a useful measure of interestingness, it is not always the best one to use.

In some cases a rule with higher support and lower lift can be more interesting than one with lower support and higher lift, because it applies to more cases.

\paragraph{Calculation}

\begin{eqnarray}
\text{Lift}(L \rightarrow R)
& = & \frac{\text{count}(L \cup R)}{\text{count}(L) \times \text{support}(R)} \nonumber \\
& = & \frac{\text{support}(L \cup R)}{\text{support}(L) \times \text{support}(R)} \nonumber \\
& = & \frac{\text{confidence}(L \rightarrow R)}{\text{support}(R)} \nonumber \\
& = & \frac{N \times \text{confidence}(L \rightarrow R)}{\text{count}(R)} \nonumber \\
& = & \frac{N \times \text{confidence}(R \rightarrow L)}{\text{count}(L)} \nonumber \\
& = & \text{Lift}(R \rightarrow L)
\end{eqnarray}
\subsection{Leverage}

\paragraph{Calculation}
\begin{displaymath}
\text{leverage}(L \rightarrow R) = \text{support}(L \cup R) - \text{support}(L) \times \text{support}(R)
\end{displaymath}
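A worked example with made-up counts, tying these measures together: suppose $N = 100$ transactions, $\text{count}(L) = 20$, $\text{count}(R) = 30$, and $\text{count}(L \cup R) = 15$. Then $\text{support}(L \cup R) = 0.15$, $\text{confidence}(L \rightarrow R) = \frac{15}{20} = 0.75$, $\text{lift}(L \rightarrow R) = \frac{0.15}{0.2 \times 0.3} = 2.5$, and $\text{leverage}(L \rightarrow R) = 0.15 - 0.2 \times 0.3 = 0.09$.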
\subsection{Frequent Itemsets}

\begin{enumerate}
\item Find itemsets of size $k$ made from 2 supported itemsets of size $k-1$
\item For each new itemset:
\begin{enumerate}
\item Check whether every sub-itemset of it also exists in the supported itemsets of size $k - 1$
\item If not every sub-itemset does, prune it
\end{enumerate}
\item With the final candidates, determine whether they have minimum support
\item To determine association rules, find which rules formed from the supported itemsets have at least minimum confidence (a code sketch follows this list)
\end{enumerate}
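A minimal Python sketch of an Apriori-style reading of the frequent-itemset steps above; the toy transactions and the minsup value are illustrative assumptions:

\begin{verbatim}
from itertools import combinations

transactions = [{"a", "b", "c"}, {"a", "b"}, {"a", "c"}, {"b", "c"}, {"a", "b", "c"}]
minsup = 0.4                      # illustrative threshold

def support(itemset):
    return sum(itemset <= t for t in transactions) / len(transactions)

# supported 1-itemsets
items = {i for t in transactions for i in t}
supported = [{frozenset([i]) for i in items if support(frozenset([i])) >= minsup}]

k = 2
while supported[-1]:
    prev = supported[-1]
    # 1. candidates of size k built from pairs of supported (k-1)-itemsets
    candidates = {a | b for a in prev for b in prev if len(a | b) == k}
    # 2. prune candidates that have any unsupported (k-1)-sub-itemset
    candidates = {c for c in candidates
                  if all(frozenset(s) in prev for s in combinations(c, k - 1))}
    # 3. keep only candidates with minimum support
    supported.append({c for c in candidates if support(c) >= minsup})
    k += 1

for level in supported:
    for itemset in sorted(map(sorted, level)):
        print(itemset, support(frozenset(itemset)))
\end{verbatim}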
\subsection{Rules Possible}

The number of rules that can be formed from a single supported itemset of size $k$ (non-empty left- and right-hand sides):

\begin{displaymath}
\sum_{i=1}^{k-1} \binom{k}{i}
\end{displaymath}

or

\begin{displaymath}
2^{k} - 2
\end{displaymath}
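For instance, a supported 3-itemset $\{a, b, c\}$ yields $2^{3} - 2 = 6$ possible rules: $a \rightarrow bc$, $b \rightarrow ac$, $c \rightarrow ab$, $ab \rightarrow c$, $ac \rightarrow b$, $bc \rightarrow a$.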
\end{document}
\endinput