diff --git a/dist/final_notecard.pdf b/dist/final_notecard.pdf
index e2d6b47..622b802 100644
Binary files a/dist/final_notecard.pdf and b/dist/final_notecard.pdf differ
diff --git a/docs/final_notecard.tex b/docs/final_notecard.tex
new file mode 100644
index 0000000..99f9d93
--- /dev/null
+++ b/docs/final_notecard.tex
@@ -0,0 +1,432 @@
+\documentclass[sigconf,authorversion,nonacm]{acmart}
+
+\usepackage{bbold}
+\usepackage{geometry}
+\geometry{margin=0.3in}
+
+\nonstopmode
+
+\begin{document}
+\title{CSCI577 Final}
+
+\maketitle
+
+\section{Data Types}
+\begin{table*}
+  \caption{Data types}
+  \begin{tabular}{lll}
+    \toprule
+    Variable type & Description & Examples \\
+    \midrule
+    Categorical & & \\
+    Nominal (unordered) & Gives only qualitative information & Names, occupations, nationalities, sex, religion \\
+    Ordinal (ordered) & Ranking or order is important & Social status, economic class \\
+    Numeric & & \\
+    Interval & Distance between values has meaning (discrete or continuous) & Year, temperature \\
+    Ratio & Ratio of two values has meaning & Wealth, age, prices, wages \\
+    \bottomrule
+  \end{tabular}
+\end{table*}
+
+\section{Math}
+Change of base:
+\begin{displaymath}
+  \log_{b}a = \frac{\log_{x}a}{\log_{x}b}
+\end{displaymath}
+
+\section{Classification}
+\paragraph{Definition}
+Constructing a method of classifying new instances using information in a training set.
+
+\begin{itemize}
+  \item Naive Bayes (Conditional Probabilities)
+  \item Decision Trees
+  \item Logistic Regression
+  \item Neural Networks
+\end{itemize}
+
+\subsection{Naïve Bayes}
+Bayes' theorem:
+\begin{displaymath}
+  P(A|B) = \frac{P(A,B)}{P(B)} = \frac{P(B|A) \times P(A)}{P(B)}
+\end{displaymath}
+
+\paragraph{Calculation}
+The probability of a class given the attribute values is proportional to the product of the prior probability of that class and the conditional probabilities of each individual attribute value given that class:
+\begin{displaymath}
+  P(c_{i} | v) \propto P(c_{i}) \prod_{j=1}^{n} P(a_{j} = v_{j} | \text{class} = c_{i})
+\end{displaymath}
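+
+A minimal Python sketch of this calculation, assuming categorical attributes; the training data, attribute names, and helper names below are made up for illustration:
+\begin{verbatim}
+from collections import Counter, defaultdict
+
+# Tiny made-up training set: (attribute-value dict, class label).
+train = [
+    ({"outlook": "sunny", "windy": "yes"}, "play"),
+    ({"outlook": "sunny", "windy": "no"},  "play"),
+    ({"outlook": "rain",  "windy": "yes"}, "dont"),
+    ({"outlook": "rain",  "windy": "no"},  "play"),
+]
+
+n = len(train)
+class_counts = Counter(label for _, label in train)   # counts for P(c_i)
+cond_counts = defaultdict(Counter)                    # (class, attribute) -> value counts
+for attrs, label in train:
+    for attr, value in attrs.items():
+        cond_counts[(label, attr)][value] += 1
+
+def score(attrs, label):
+    """P(c_i) * prod_j P(a_j = v_j | class = c_i); proportional to P(c_i | v)."""
+    p = class_counts[label] / n
+    for attr, value in attrs.items():
+        p *= cond_counts[(label, attr)][value] / class_counts[label]
+    return p
+
+unseen = {"outlook": "sunny", "windy": "yes"}
+print(max(class_counts, key=lambda label: score(unseen, label)))  # -> "play"
+\end{verbatim}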
+
+
+\subsection{TDIDT}
+\paragraph{Definition}
+Top-Down Induction of Decision Trees.
+
+\paragraph{Algorithm}
+Until no more splitting is possible:
+\begin{itemize}
+  \item IF all the instances in the training set belong to the same class THEN return the value of the class
+  \item ELSE
+  \begin{enumerate}
+    \item Select an attribute A to split on
+    \item Sort the instances in the training set into subsets, one for each value of attribute A
+    \item Return a tree with one branch for each non-empty subset
+    \begin{itemize}
+      \item Each branch has a descendant subtree or a class value produced by applying the algorithm recursively
+    \end{itemize}
+  \end{enumerate}
+\end{itemize}
+
+
+\subsection{Adequacy}
+\paragraph{Definition}
+No two instances with the same values of all the attributes may belong to different classes.
+Naïve Bayes can still be used when this condition does not hold, as it will still produce the probability of each class.
+kNN can also still be used, since multiple data points at the same location in Euclidean space still behave as expected under majority voting.
+
+\subsection{Overfitting}
+Understand the concept of overfitting and be able to tell how you would know that a classification system overfit.
+\paragraph{Definition}
+A classifier overfits if it generates a decision tree (or other mechanism) too well adapted to the training set:
+it performs well on the training set but not well on other data.
+Some overfitting is inevitable.
+
+Remedy:
+\begin{itemize}
+  \item Adjust a decision tree while it is being generated: pre-pruning
+  \item Modify the tree after creation: post-pruning
+\end{itemize}
+
+\subsubsection{Clashes}
+Two (or more) instances of a training set have identical attribute values but different classifications.
+This is especially a problem for TDIDT's adequacy condition.
+
+\paragraph{Stems from}
+\begin{itemize}
+  \item A classification incorrectly recorded
+  \item The recorded attributes being insufficient; more attributes would be needed, which is normally impossible
+\end{itemize}
+
+\paragraph{Solutions}
+\begin{itemize}
+  \item Discard the branch to the clashing node from the node above
+  \item Assign the clashing instances the majority label
+\end{itemize}
+
+\subsubsection{Prepruning}
+Pre-pruning may reduce accuracy on the training set but may perform better on test (and subsequent) data than an unpruned classifier.
+\begin{enumerate}
+  \item Test whether a termination condition applies.
+  \begin{itemize}
+    \item If so, the current subset is treated as a `clash set'
+    \item Resolve by `delete branch', `majority voting', etc.
+  \end{itemize}
+  \item Two common termination conditions
+  \begin{itemize}
+    \item Size cutoff: prune if the subset has fewer than X instances
+    \item Maximum depth: prune if the length of the branch exceeds Y
+  \end{itemize}
+\end{enumerate}
+
+\subsubsection{PostPruning}
+\begin{enumerate}
+  \item Look for non-leaf nodes whose descendants are all leaves (subtrees of depth 1).
+  \item In the worked example tree, only nodes G and D are candidates for pruning (consolidation).
+\end{enumerate}
+
+
+\subsection{Discretizing}
+
+\subsubsection{Equal Width Intervals}
+\subsubsection{Pseudo Attributes}
+\subsubsection{Processing Sorted Instance Table}
+\subsubsection{ChiMerge}
+
+\paragraph{Rationalization}
+Initially, each distinct value of a numerical attribute $A$ is considered to be one interval.
+$\chi^{2}$ tests are performed for every pair of adjacent intervals.
+The adjacent intervals with the least $\chi^{2}$ value are merged together, because a low $\chi^{2}$ value for a pair indicates similar class distributions.
+This merging process proceeds recursively until a predefined stopping criterion is met.
+For two adjacent intervals, if the $\chi^{2}$ test concludes that the class is independent of the interval, the intervals should be merged.
+If the $\chi^{2}$ test concludes that they are not independent, i.e. the difference in relative class frequency is statistically significant, the two intervals should remain separate.
+
+\paragraph{Calculation}
+To calculate the expected value for any combination of row and class:
+\begin{enumerate}
+  \item Take the product of the corresponding row sum and column sum
+  \item Divide by the grand total of the observed values for the two rows
+\end{enumerate}
+
+Then:
+\begin{enumerate}
+  \item Using the observed and expected values, calculate, for each of the cells:
+  \begin{math}
+    \frac{(O - E)^{2}}{E}
+  \end{math}
+  \item Sum the cell values to obtain $\chi^{2}$
+\end{enumerate}
+When the sum exceeds the $\chi^{2}$ threshold, the independence hypothesis is rejected;
+a small value supports the hypothesis.
+Important adjustment: when $E < 0.5$, replace it with $0.5$.
+\begin{enumerate}
+  \item Select the smallest $\chi^{2}$ value
+  \item Compare it to the threshold
+  \item If it falls below the threshold, merge the interval with the row immediately below it
+  \item Recalculate $\chi^{2}$; this only needs to be done for rows adjacent to the recently merged one
+\end{enumerate}
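+
+In symbols (a restatement of the procedure above, where $O_{ij}$ and $E_{ij}$ are the observed and expected counts for interval row $i$ and class column $j$, $R_{i}$ is the row sum, $C_{j}$ is the column sum, and $N$ is the grand total of the observed values for the two rows):
+\begin{displaymath}
+  E_{ij} = \frac{R_{i} \times C_{j}}{N}
+  \qquad
+  \chi^{2} = \sum_{i,j} \frac{(O_{ij} - E_{ij})^{2}}{E_{ij}}
+\end{displaymath}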
+
+A large number of intervals does little to solve the problem of discretization, while just one interval cannot contribute to the decision-making process.
+Remedies:
+\begin{itemize}
+  \item Modify the significance level that the hypothesis of independence must pass before triggering an interval merge
+  \item Set a minimum and a maximum number of intervals
+\end{itemize}
+
+\subsection{Entropy}
+\paragraph{Definition}
+Entropy is a measure of the extent to which there is more than one possible classification for the instances.
+It is used for selecting the splitting attribute in decision trees.
+Splitting to minimize entropy tends to minimize the complexity (number of branches) of the decision tree, although there is no guarantee that using entropy will always lead to a small decision tree.
+It is also used for feature reduction: calculate the value of information gain for each attribute in the original dataset, discard all attributes that do not meet a specified criterion, and pass the revised dataset to the preferred classification algorithm.
+
+Entropy has a bias towards selecting attributes with a large number of values.
+\paragraph{Calculation}
+To decide whether to split on an attribute:
+\begin{enumerate}
+  \item Find the entropy of the data in each of the branches after the split
+  \item Take the weighted average of those values (weighted by the proportion of instances in each branch) and use it to find the information gain
+  \item The attribute split with the highest information gain (lowest new entropy) is selected
+\end{enumerate}
+
+\begin{itemize}
+  \item Entropy is always positive or zero
+  \item Entropy is zero when some $p_{i} = 1$, i.e. when all instances have the same class
+  \item Entropy is at its maximum value ($\log_{2}K$ for $K$ classes) when all classes are evenly distributed
+\end{itemize}
+
+If there are $K$ classes, we can denote the proportion of instances with classification $i$ by $p_{i}$ for $i = 1, \ldots, K$:
+$p_{i} = \frac{\text{instances of class } i}{\text{total number of instances}}$
+
+\begin{displaymath}
+  \text{Entropy} = E = -\sum_{i=1}^{K} p_{i} \log_{2} p_{i}
+\end{displaymath}
+
+where $K$ is the number of non-empty classes and $p_{i} = \frac{\lvert i \rvert}{N}$, the number of instances in class $i$ over the total number of instances $N$.
+
+
+\subsection{GINI}
+
+\paragraph{Calculation}
+\begin{enumerate}
+  \item For each non-empty column, form the sum of the squares of the values in the body of the table and divide by the column sum.
+  \item Add the values obtained for all the columns and divide by $N$ (the number of instances).
+  \item Subtract the total from 1.
+\end{enumerate}
+
+\subsection{Information Gain}
+\paragraph{Definition}
+The difference between the entropy before and after splitting on a given attribute in a decision tree.
+Maximizing information gain is the same as minimizing $E_{\text{new}}$.
+\paragraph{Calculation}
+
+\begin{displaymath}
+  \text{Information Gain} = E_{\text{start}} - E_{\text{new}}
+\end{displaymath}
+
+Starting node (for example, 24 instances split 4/5/15 among three classes):
+
+\begin{eqnarray}
+  E_{\text{start}} = -\frac{4}{24}\log_{2}\frac{4}{24} \nonumber \\
+  -\frac{5}{24}\log_{2}\frac{5}{24} \nonumber \\
+  -\frac{15}{24}\log_{2}\frac{15}{24}
+\end{eqnarray}
+
+After splitting on an attribute whose three branches each contain 8 instances:
+
+\begin{eqnarray}
+  E_{\text{new}} = \frac{8}{24}E_{1} \nonumber \\
+  + \frac{8}{24}E_{2} \nonumber \\
+  + \frac{8}{24}E_{3}
+\end{eqnarray}
+
+
+\paragraph{Uses}
+
+
+\section{Clustering}
+\paragraph{Definition}
+Grouping data into separate groups using a distance metric between data points.
+Groups should be distinct from one another and composed of items that are similar to one another and different from the items in other groups.
+
+\subsection{Nearest Neighbors}
+Mainly used when all attribute values are continuous.
+
+General strategy (sketched below):
+\begin{enumerate}
+  \item Find the $k$ training instances that are closest to the unseen instance.
+  \item Take the most commonly occurring classification for these instances.
+\end{enumerate}
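+
+A minimal Python sketch of this strategy, assuming continuous attributes and Euclidean distance; the training points and labels below are made up for illustration:
+\begin{verbatim}
+from collections import Counter
+from math import dist  # Euclidean distance (Python 3.8+)
+
+# Tiny made-up training set: (attribute vector, class label).
+train = [((1.0, 2.0), "A"), ((1.5, 1.8), "A"),
+         ((5.0, 8.0), "B"), ((6.0, 9.0), "B")]
+
+def knn_classify(x, k=3):
+    # 1. Find the k training instances closest to the unseen instance x.
+    nearest = sorted(train, key=lambda pair: dist(pair[0], x))[:k]
+    # 2. Take the most commonly occurring classification among them.
+    return Counter(label for _, label in nearest).most_common(1)[0][0]
+
+print(knn_classify((1.2, 1.9)))  # -> "A"
+\end{verbatim}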
+
+Common clustering algorithms:
+\begin{itemize}
+  \item KMeans
+  \item DBSCAN
+\end{itemize}
+
+
+\section{Sequence Mining}
+\textbf{TODO}
+\paragraph{Definition}
+Finding meaningful, recurring sequences of events.
+
+\begin{itemize}
+  \item A sequence is an ordered list of elements (transactions):
+  \begin{displaymath}
+    s = \langle e_{1}\; e_{2}\; e_{3} \cdots \rangle
+  \end{displaymath}
+  \item Each element contains a collection of events (items):
+  \begin{displaymath}
+    e_{i} = \{ i_{1}, i_{2}, i_{3}, \ldots, i_{k} \}
+  \end{displaymath}
+  \item Each element is attributed to a specific time or location.
+  \item The length of a sequence, $\lvert s \rvert$, is given by the number of elements of the sequence.
+  \item A $k$-sequence is a sequence that contains $k$ events (items).
+\end{itemize}
+
+\paragraph{Contains}
+A sequence $\langle a_{1}\, a_{2} \cdots a_{n} \rangle$
+is contained in another sequence $\langle b_{1}\, b_{2} \cdots b_{m} \rangle$ ($m \geq n$)
+if there exist integers $i_{1} < i_{2} < \cdots < i_{n}$
+such that $a_{1} \subseteq b_{i_{1}},\; a_{2} \subseteq b_{i_{2}},\; \ldots,\; a_{n} \subseteq b_{i_{n}}$.
+
+\paragraph{Support}
+The support of a subsequence $w$ is defined as the fraction of data sequences that contain $w$.
+A sequential pattern is a frequent subsequence (i.e., a subsequence with $\text{support} \geq \text{minsup}$).
+
+\subsection{Generalized Sequential Pattern}
+\begin{enumerate}
+  \item Make the first pass over the sequence database $D$ to yield all the 1-element frequent sequences
+  \item Repeat until no new frequent sequences are found:
+  \begin{enumerate}
+    \item Candidate Generation: merge pairs of frequent subsequences found in the $(k-1)$th pass to generate candidate sequences that contain $k$ items
+    \item Initial Pruning: prune a candidate $k$-sequence unless all of its $(k-1)$-subsequences are frequent
+    \item Support Counting: make a new pass over the sequence database $D$ to find the support for these candidate sequences
+    \item Candidate Elimination: eliminate candidate $k$-sequences whose actual support is less than minsup
+  \end{enumerate}
+\end{enumerate}
+
+\subsection{Counting Methods}
+\begin{itemize}
+  \item COBJ: one occurrence per object
+  \item CWIN: one occurrence per sliding window
+  \item CMINWIN: number of minimal windows of occurrence
+  \item CDIST\_O: distinct occurrences with the possibility of event-timestamp overlap
+  \item CDIST: distinct occurrences with no event-timestamp overlap allowed
+\end{itemize}
+
+
+\section{Association Rule Analysis}
+\paragraph{Definition}
+Given a collection of collections (a database of transactions of food items), find items with high co-occurrence.
+
+Let $m$ be the number of possible items that can be bought.
+Let $I$ denote the set of all possible items.
+Possible itemsets: $2^{\lvert I \rvert}$.
+An itemset $S$ matches a transaction $T$ (itself an itemset) if $S \subseteq T$.
+
+\subsection{Support}
+
+\paragraph{Definition}
+$\text{support}(S)$: the proportion of transactions matched by $S$, i.e.\ the proportion of transactions that contain all the items in $S$;
+the frequency with which the items in $S$ occur together in the database.
+
+\paragraph{Calculation}
+\begin{displaymath}
+  \text{support}(S) = \frac{\text{count}(S)}{n}
+\end{displaymath}
+where $n$ is the number of transactions in the database.
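+
+A minimal Python sketch of this count, using a tiny made-up transaction database (item names are illustrative only):
+\begin{verbatim}
+# Tiny made-up transaction database: each transaction is a set of items.
+transactions = [
+    {"milk", "bread", "eggs"},
+    {"milk", "bread"},
+    {"bread", "butter"},
+    {"milk", "eggs"},
+]
+
+def support(itemset):
+    # Proportion of transactions that contain all the items in the itemset.
+    matches = sum(1 for t in transactions if itemset <= t)  # itemset subset of t
+    return matches / len(transactions)
+
+print(support({"milk", "bread"}))  # 2 of 4 transactions -> 0.5
+\end{verbatim}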
+
+
+\subsubsection{Uses}
+
+\subsection{Confidence}
+
+\paragraph{Calculation}
+The confidence of a rule can be calculated either by
+\begin{displaymath}
+  \text{confidence}(L \rightarrow R) = \frac{\text{count}(L \cup R)}{\text{count}(L)}
+\end{displaymath}
+or
+\begin{displaymath}
+  \text{confidence}(L \rightarrow R) = \frac{\text{support}(L \cup R)}{\text{support}(L)}
+\end{displaymath}
+Reject rules where
+\begin{displaymath}
+  \text{support} < \text{minsup} \approx 0.01 = 1\%
+\end{displaymath}
+An itemset meeting the support threshold is also called a frequent, large, or supported itemset.
+
+Reject rules where
+\begin{displaymath}
+  \text{confidence} < \text{minconf} \approx 0.8 = 80\%
+\end{displaymath}
+
+\paragraph{Uses}
+
+\subsection{Lift}
+\paragraph{Definition}
+Lift measures how many more times the items in $L$ and $R$ occur together than would be expected if they were statistically independent.
+Although lift is a useful measure of interestingness, it is not always the best one to use.
+In some cases a rule with higher support and lower lift can be more interesting than one with lower support and higher lift because it applies to more cases.
+\paragraph{Calculation}
+
+\begin{eqnarray}
+  \text{Lift}(L \rightarrow R)
+  = \frac{\text{count}(L \cup R)}{\text{count}(L) \times \text{support}(R)} \nonumber
+  \\
+  = \frac{\text{support}(L \cup R)}{\text{support}(L) \times \text{support}(R)} \nonumber
+  \\
+  = \frac{\text{confidence}(L \rightarrow R)}{\text{support}(R)} \nonumber
+  \\
+  = \frac{N \times \text{confidence}(L \rightarrow R)}{\text{count}(R)} \nonumber
+  \\
+  = \frac{N \times \text{confidence}(R \rightarrow L)}{\text{count}(L)} \nonumber
+  \\
+  = \text{Lift}(R \rightarrow L)
+\end{eqnarray}
+
+\subsection{Leverage}
+\paragraph{Calculation}
+\begin{displaymath}
+  \text{leverage}(L \rightarrow R) = \text{support}(L \cup R) - \text{support}(L) \times \text{support}(R)
+\end{displaymath}
+
+\subsection{Frequent Itemsets}
+\begin{enumerate}
+  \item Find itemsets of size $k$ made from 2 supported itemsets of size $k-1$
+  \item For each new itemset:
+  \begin{enumerate}
+    \item Check whether every sub-itemset in it also exists in the supported itemsets of size $k - 1$
+    \item If not every sub-itemset does, then prune it
+  \end{enumerate}
+  \item With the final candidates, determine whether they have minimum support
+  \item To determine association rules, find which rules formed from the supported itemsets have at least minimum confidence
+\end{enumerate}
+
+
+\subsection{Rules Possible}
+
+For rules with $i$ items on the left-hand side of a $k$-itemset:
+
+\begin{displaymath}
+  {}_{k}C_{i}
+\end{displaymath}
+
+or, for the total number of rules from a $k$-itemset:
+
+\begin{displaymath}
+  2^{k} - 2
+\end{displaymath}
+
+
+\end{document}
+\endinput