// java-projects/dna/DNA.java

import java.util.*;
import java.io.*;
import java.text.*;

/**
 * Reports information about DNA nucleotide sequences that may encode proteins.
 *
 * The input file is read as pairs of lines (a region name followed by its
 * nucleotide sequence). For every pair the program writes the nucleotide
 * counts, mass percentages, codon list and a protein verdict to the output file.
 *
 * Index convention used by every array in this class: A=0, C=1, G=2, T=3, junk('-')=4.
 */
public class DNA {

    /** Minimum number of codons a valid protein must have. */
    public static final int MIN_CODONS = 5;
    /** Minimum percentage of total mass that C and G together must contribute for a valid protein. */
    public static final int MIN_MASS_PERCENT = 30;
    /** Number of unique nucleotides (A, C, G and T). */
    public static final int UNIQUE_NUCLEOTIDES = 4;
    /** Number of nucleotides per codon (the constant's name is inverted; kept for compatibility). */
    public static final int CODONS_PER_NUCLEOTIDE = 3;
    /** Directory prefix prepended to the input file name. */
    public static final String INPUT_DIR = "input/";
    /** Directory prefix prepended to the output file name. */
    public static final String OUTPUT_DIR = "output/";
    /** When true, skips the prompts and uses fixed file names. */
    public static final boolean DEBUG = false;
    /** Molar masses (grams/mol) in A, C, G, T, junk order. */
    public static final double[] MOLAR_MASSES = {135.128, 111.103, 151.128, 125.107, 100.00};
    /** Symbols in the same A, C, G, T, junk order as {@link #MOLAR_MASSES}. */
    public static final char[] NUCLEOTIDES = {'A', 'C', 'G', 'T', '-'};

    /**
     * Prints an introduction, prompts for the input and output file names, reads
     * the name/sequence line pairs from the input file and writes the report to
     * the output file.
     *
     * A trailing name line without a sequence line is ignored. If the input file
     * is not a readable regular file a message is printed to standard error and
     * nothing is written.
     *
     * @param args unused
     * @throws FileNotFoundException if the input or output file cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        // The console scanner wraps System.in and is deliberately not closed.
        Scanner console = new Scanner(System.in);
        String[] files = promptForFileNames(console);
        File inFile = new File(INPUT_DIR + files[0]);
        File outFile = new File(OUTPUT_DIR + files[1]);

        // isFile() also rejects directories, which canRead() alone would accept.
        if (!inFile.isFile() || !inFile.canRead()) {
            System.err.println("Cannot read input file: " + inFile.getPath());
            return;
        }

        // Each record is two lines; integer division drops a dangling last line.
        int sequenceCount = linesInFile(inFile) / 2;
        String[] sequenceNames = new String[sequenceCount];
        String[] sequences = new String[sequenceCount];

        try (Scanner input = new Scanner(inFile)) {
            for (int i = 0; i < sequenceCount; i++) {
                sequenceNames[i] = input.nextLine();
                sequences[i] = input.nextLine().toUpperCase(Locale.ROOT);
            }
        }

        // PrintStream(File) truncates any existing file; closing it releases the handle.
        try (PrintStream output = new PrintStream(outFile)) {
            saveOutput(sequences, sequenceNames, output);
        }
    }

    /**
     * Writes one report section per sequence to the given stream.
     *
     * Sequences are printed exactly as passed in. Nucleotide counts and mass
     * percentages are shown in A, C, G, T order.
     *
     * @param sequences nucleotide sequences (may contain junk '-' characters)
     * @param names     region names; must have at least as many entries as {@code sequences}
     * @param output    destination stream
     */
    public static void saveOutput(String[] sequences, String[] names, PrintStream output) {
        for (int i = 0; i < sequences.length; i++) {
            String sequence = sequences[i];
            output.println("Region Name: " + names[i]);
            output.println("Nucleotides: " + sequence);
            output.println("Nuc. Counts: " + Arrays.toString(nucleotideCountFromSequence(sequence)));
            output.println("Total Mass%: "
                    + Arrays.toString(nucleotideMassPercentFromSequence(sequence))
                    + " of " + totalMassFromSequence(sequence));
            output.println("Codons List: " + Arrays.toString(codonsFromSequence(sequence)));
            output.println("Is Protein?: " + (isProtein(sequence) ? "YES" : "NO"));
            output.println();
        }
    }

    /**
     * Tests whether a sequence encodes a protein: it must have at least
     * {@link #MIN_CODONS} codons, start with ATG, end with TAA, TAG or TGA, and
     * C plus G must account for at least {@link #MIN_MASS_PERCENT} percent of
     * the total mass.
     *
     * @param sequence nucleotide sequence, any case, may contain junk
     * @return true if all four conditions hold
     */
    public static boolean isProtein(String sequence) {
        String[] codons = codonsFromSequence(sequence.toUpperCase(Locale.ROOT));
        // Length is checked first so that codons[0] is never read from an empty array.
        if (codons.length < MIN_CODONS) {
            return false;
        }
        if (!codons[0].equals("ATG")) {
            return false;
        }
        if (!isStopCodon(codons[codons.length - 1])) {
            return false;
        }
        double[] percents = nucleotideMassPercentFromSequence(sequence);
        double cgPercent = percents[nucleotideIndexFromChar('C')]
                + percents[nucleotideIndexFromChar('G')];
        // Re-round the sum so floating-point noise cannot push an exact 30.00 below the limit.
        return roundTo(cgPercent, 2) >= MIN_MASS_PERCENT;
    }

    /** Returns true if the (upper-case) codon is one of the stop codons TAA, TAG, TGA. */
    private static boolean isStopCodon(String codon) {
        return codon.equals("TAA") || codon.equals("TAG") || codon.equals("TGA");
    }

    /**
     * Computes the mass contributed by each of A, C, G and T (junk excluded).
     *
     * @param sequence nucleotide sequence, any case, may contain junk
     * @return array of size 4 in A, C, G, T order, each rounded to two decimals
     */
    public static double[] nucleotideMassesFromSequence(String sequence) {
        int[] counts = nucleotideCountFromSequence(sequence);
        double[] masses = new double[UNIQUE_NUCLEOTIDES];
        for (int i = 0; i < masses.length; i++) {
            masses[i] = roundTo(counts[i] * MOLAR_MASSES[i], 2);
        }
        return masses;
    }

    /**
     * Computes the percentage of the total mass (junk included in the total)
     * contributed by each of A, C, G and T.
     *
     * @param sequence nucleotide sequence, any case, may contain junk
     * @return array of size 4 in A, C, G, T order, each rounded to two decimals;
     *         all zeros when the sequence has no mass
     */
    public static double[] nucleotideMassPercentFromSequence(String sequence) {
        int[] counts = countSymbols(sequence);
        double total = unroundedTotalMass(counts);
        double[] percentages = new double[UNIQUE_NUCLEOTIDES];
        if (total == 0.0) {
            // Avoids 0/0 = NaN for empty or unrecognised input.
            return percentages;
        }
        for (int i = 0; i < percentages.length; i++) {
            percentages[i] = roundTo(counts[i] * MOLAR_MASSES[i] / total * 100.0, 2);
        }
        return percentages;
    }

    /**
     * Computes the total mass of a sequence, junk included.
     *
     * @param sequence nucleotide sequence, any case, may contain junk
     * @return total mass rounded once, at the end, to one decimal
     */
    public static double totalMassFromSequence(String sequence) {
        return roundTo(unroundedTotalMass(countSymbols(sequence)), 1);
    }

    /**
     * Counts the exact (case-sensitive) occurrences of one character.
     *
     * @param sequence   text to scan
     * @param nucleotide character to count
     * @return number of positions holding {@code nucleotide}
     */
    public static int nucleotideCount(String sequence, char nucleotide) {
        int count = 0;
        for (int i = 0; i < sequence.length(); i++) {
            if (sequence.charAt(i) == nucleotide) {
                count++;
            }
        }
        return count;
    }

    /**
     * Converts a nucleotide (or junk) character to its molar mass in grams/mol.
     *
     * @param nucleotide one of A, C, G, T (any case) or '-'
     * @return the unrounded value from {@link #MOLAR_MASSES}
     * @throws IllegalArgumentException if the character is not recognised
     */
    public static double nucleotideToMolarMass(char nucleotide) {
        int index = nucleotideIndexFromChar(nucleotide);
        if (index < 0) {
            throw new IllegalArgumentException("Unknown nucleotide: " + nucleotide);
        }
        return MOLAR_MASSES[index];
    }

    /**
     * Counts the As, Cs, Gs and Ts in a sequence. Junk and unrecognised
     * characters are skipped.
     *
     * @param sequence nucleotide sequence, any case, may contain junk
     * @return array of size 4 in A, C, G, T order
     */
    public static int[] nucleotideCountFromSequence(String sequence) {
        return Arrays.copyOf(countSymbols(sequence), UNIQUE_NUCLEOTIDES);
    }

    /**
     * Maps a character to its array index: A=0, C=1, G=2, T=3 (any case), '-'=4.
     *
     * @param nucleotide character to map
     * @return the index, or -1 if the character is not a nucleotide or junk
     */
    public static int nucleotideIndexFromChar(char nucleotide) {
        switch (nucleotide) {
            case 'a':
            case 'A':
                return 0;
            case 'c':
            case 'C':
                return 1;
            case 'g':
            case 'G':
                return 2;
            case 't':
            case 'T':
                return 3;
            case '-':
                return 4;
            default:
                return -1;
        }
    }

    /**
     * Removes every junk character ('-') from a sequence.
     *
     * @param sequence raw sequence
     * @return the sequence without '-' characters
     */
    public static String removeJunk(String sequence) {
        return sequence.replace("-", "");
    }

    /**
     * Splits a sequence into codons of {@link #CODONS_PER_NUCLEOTIDE} characters
     * after removing junk. Trailing characters that do not fill a whole codon
     * are dropped. Letter case is preserved.
     *
     * @param sequence nucleotide sequence, may contain junk
     * @return the codons in order; empty if the sequence is shorter than one codon
     */
    public static String[] codonsFromSequence(String sequence) {
        String cleaned = removeJunk(sequence);
        String[] codons = new String[cleaned.length() / CODONS_PER_NUCLEOTIDE];
        for (int i = 0; i < codons.length; i++) {
            int start = i * CODONS_PER_NUCLEOTIDE;
            codons[i] = cleaned.substring(start, start + CODONS_PER_NUCLEOTIDE);
        }
        return codons;
    }

    /**
     * Prints the introduction and asks for the input and output file names,
     * re-prompting while {@link #validFilename(String)} rejects the answer.
     * When {@link #DEBUG} is true no prompt is shown and fixed names are returned.
     *
     * @param console scanner to read the answers from
     * @return array of size 2: input file name, then output file name
     */
    public static String[] promptForFileNames(Scanner console) {
        System.out.println("This program reports information about DNA");
        System.out.println("nucleotide sequences that may encode");
        System.out.println("proteins.");

        if (DEBUG) {
            return new String[] {"dna.txt", "dna.txt"};
        }

        String[] files = new String[2];
        files[0] = promptForFileName(console, "Input file name? ");
        files[1] = promptForFileName(console, "Output file name? ");
        System.out.println();
        return files;
    }

    /** Shows the prompt and reads lines until one passes validFilename. */
    private static String promptForFileName(Scanner console, String prompt) {
        System.out.print(prompt);
        String name = console.nextLine();
        while (!validFilename(name)) {
            System.out.println("Invalid. Try again.");
            System.out.print(prompt);
            name = console.nextLine();
        }
        return name;
    }

    /**
     * Counts the lines in a file.
     *
     * @param file file to read
     * @return number of lines (0 for an empty file)
     * @throws FileNotFoundException if the file cannot be opened
     */
    public static int linesInFile(File file) throws FileNotFoundException {
        int count = 0;
        try (Scanner input = new Scanner(file)) {
            while (input.hasNextLine()) {
                input.nextLine();
                count++;
            }
        }
        return count;
    }

    /**
     * Checks a file name typed by the user.
     *
     * @param name candidate file name
     * @return false for null or blank names (a blank name would resolve to the
     *         bare directory prefix), true otherwise
     */
    public static boolean validFilename(String name) {
        return name != null && !name.trim().isEmpty();
    }

    /** Counts A, C, G, T and junk occurrences (index order), skipping unknown characters. */
    private static int[] countSymbols(String sequence) {
        int[] counts = new int[MOLAR_MASSES.length];
        for (int i = 0; i < sequence.length(); i++) {
            int index = nucleotideIndexFromChar(sequence.charAt(i));
            if (index >= 0) {
                counts[index]++;
            }
        }
        return counts;
    }

    /** Sums count * molar mass over all five symbols without any rounding. */
    private static double unroundedTotalMass(int[] counts) {
        double total = 0.0;
        for (int i = 0; i < counts.length; i++) {
            total += counts[i] * MOLAR_MASSES[i];
        }
        return total;
    }

    /** Rounds a value to the given number of decimal places, independent of locale. */
    private static double roundTo(double value, int places) {
        double scale = Math.pow(10, places);
        return Math.round(value * scale) / scale;
    }
}