333 lines
11 KiB
Java
333 lines
11 KiB
Java
|
import java.util.*;
|
||
|
import java.io.*;
|
||
|
import java.text.*;
|
||
|
|
||
|
/**
 * Reports information about DNA nucleotide sequences that may encode proteins.
 *
 * <p>The input file is read as pairs of lines (a region name followed by its
 * nucleotide sequence). For every pair the program writes the nucleotide counts,
 * the mass percentages, the codon list and a protein verdict to the output file.
 *
 * <p>All per-nucleotide arrays use the order A, C, G, T (index 0 to 3); index 4 is
 * the junk marker {@code '-'}, which contributes mass but is otherwise ignored.
 */
public class DNA {

    /** Minimum number of codons a valid protein must have. */
    static final int MIN_CODONS = 5;

    /** Minimum percentage of the total mass that C and G together must contribute. */
    static final int MIN_MASS_PERCENT = 30;

    /** Number of unique nucleotides (A, C, G and T); the junk marker is not included. */
    static final int UNIQUE_NUCLEOTIDES = 4;

    /** Number of nucleotides per codon (the reversed name is kept for compatibility). */
    public static final int CODONS_PER_NUCLEOTIDE = 3;

    /** Directory prefix prepended to the input file name. */
    public static final String INPUT_DIR = "input/";

    /** Directory prefix prepended to the output file name. */
    public static final String OUTPUT_DIR = "output/";

    /** When true, the file-name prompts are skipped and fixed names are used. */
    public static final boolean DEBUG = false;

    /** Molar masses (grams/mol) in the order A, C, G, T, junk. */
    public static final double[] MOLAR_MASSES = {135.128, 111.103, 151.128, 125.107, 100.00};

    /** Nucleotide characters in the same order as {@link #MOLAR_MASSES}. */
    public static final char[] NUCLEOTIDES = {'A', 'C', 'G', 'T', '-'};

    /**
     * Prints an introduction, prompts for the input and output file names, reads the
     * name/sequence pairs from the input file and writes the report to the output file.
     *
     * <p>A trailing name line without a sequence line is ignored. If the input file
     * cannot be read, a message is printed to standard error and nothing is written.
     *
     * @param args unused
     * @throws FileNotFoundException if the input or output file cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        Scanner console = new Scanner(System.in);
        String[] files = promptForFileNames(console);
        File inFile = new File(INPUT_DIR + files[0]);
        File outFile = new File(OUTPUT_DIR + files[1]);

        if (!inFile.isFile() || !inFile.canRead()) {
            System.err.println("Cannot read input file: " + inFile.getPath());
            return;
        }

        // Integer division keeps only complete (name, sequence) pairs.
        int sequenceCount = linesInFile(inFile) / 2;
        String[] sequences = new String[sequenceCount];
        String[] sequenceNames = new String[sequenceCount];

        Scanner input = new Scanner(inFile);
        try {
            for (int i = 0; i < sequenceCount; i++) {
                sequenceNames[i] = input.nextLine();
                sequences[i] = input.nextLine().toUpperCase(Locale.ROOT);
            }
        } finally {
            input.close();
        }

        // Opening the PrintStream truncates any existing output file.
        PrintStream output = new PrintStream(outFile);
        try {
            saveOutput(sequences, sequenceNames, output);
        } finally {
            output.close();
        }
    }

    /**
     * Writes one report per sequence to the given stream. Counts and mass
     * percentages are printed in A, C, G, T order.
     *
     * @param sequences nucleotide sequences, one per region
     * @param names     region names; {@code names[i]} belongs to {@code sequences[i]}
     * @param output    stream the report is written to (not closed by this method)
     */
    public static void saveOutput(String[] sequences, String[] names, PrintStream output) {
        for (int i = 0; i < sequences.length; i++) {
            String sequence = sequences[i];
            output.println("Region Name: " + names[i]);
            output.println("Nucleotides: " + sequence);
            output.println("Nuc. Counts: " + Arrays.toString(nucleotideCountFromSequence(sequence)));
            output.println("Total Mass%: "
                    + Arrays.toString(nucleotideMassPercentFromSequence(sequence))
                    + " of " + totalMassFromSequence(sequence));
            output.println("Codons List: " + Arrays.toString(codonsFromSequence(sequence)));
            output.println("Is Protein?: " + (isProtein(sequence) ? "YES" : "NO"));
            output.println();
        }
    }

    /**
     * Tests whether a sequence encodes a protein: it has at least {@link #MIN_CODONS}
     * codons (junk removed), starts with ATG, ends with TAA, TAG or TGA, and C and G
     * together make up at least {@link #MIN_MASS_PERCENT} percent of the total mass.
     *
     * @param sequence nucleotide sequence, possibly containing junk
     * @return true if all four conditions hold
     */
    public static boolean isProtein(String sequence) {
        String[] codons = codonsFromSequence(sequence);
        // Checked first so that the first/last codon lookups below are always in range.
        if (codons.length < MIN_CODONS) {
            return false;
        }
        if (!codons[0].equalsIgnoreCase("ATG")) {
            return false;
        }
        String last = codons[codons.length - 1];
        boolean hasStopCodon = last.equalsIgnoreCase("TAA")
                || last.equalsIgnoreCase("TAG")
                || last.equalsIgnoreCase("TGA");
        if (!hasStopCodon) {
            return false;
        }

        int[] counts = countWithJunk(sequence);
        double total = exactTotalMass(counts);
        int c = nucleotideIndexFromChar('C');
        int g = nucleotideIndexFromChar('G');
        double cgMass = counts[c] * MOLAR_MASSES[c] + counts[g] * MOLAR_MASSES[g];
        // total is positive here because the sequence holds at least MIN_CODONS codons.
        return cgMass / total * 100 >= MIN_MASS_PERCENT;
    }

    /**
     * Computes the mass contributed by each of A, C, G and T (junk excluded).
     *
     * @param sequence nucleotide sequence, possibly containing junk
     * @return masses in A, C, G, T order, each rounded to two decimals
     */
    public static double[] nucleotideMassesFromSequence(String sequence) {
        int[] counts = nucleotideCountFromSequence(sequence);
        double[] masses = new double[UNIQUE_NUCLEOTIDES];
        for (int i = 0; i < masses.length; i++) {
            masses[i] = round(counts[i] * MOLAR_MASSES[i], "#.##");
        }
        return masses;
    }

    /**
     * Computes the percentage of the total mass (junk included in the total)
     * contributed by each of A, C, G and T.
     *
     * @param sequence nucleotide sequence, possibly containing junk
     * @return percentages in A, C, G, T order, each rounded to two decimals;
     *         all zeros when the sequence has no mass
     */
    public static double[] nucleotideMassPercentFromSequence(String sequence) {
        int[] counts = countWithJunk(sequence);
        double total = exactTotalMass(counts);
        double[] percentages = new double[UNIQUE_NUCLEOTIDES];
        if (total == 0.0) {
            return percentages; // avoid 0/0 for empty sequences
        }
        for (int i = 0; i < percentages.length; i++) {
            // Work from unrounded values so rounding happens exactly once.
            percentages[i] = round(counts[i] * MOLAR_MASSES[i] / total * 100, "#.##");
        }
        return percentages;
    }

    /**
     * Computes the total mass of a sequence, junk included.
     *
     * @param sequence nucleotide sequence, possibly containing junk
     * @return total mass rounded to one decimal
     */
    public static double totalMassFromSequence(String sequence) {
        return round(exactTotalMass(countWithJunk(sequence)), "#.#");
    }

    /**
     * Counts the exact (case-sensitive) occurrences of a character in a sequence.
     *
     * @param sequence   sequence to scan
     * @param nucleotide character to count
     * @return number of positions holding {@code nucleotide}
     */
    public static int nucleotideCount(String sequence, char nucleotide) {
        int count = 0;
        for (int i = 0; i < sequence.length(); i++) {
            if (sequence.charAt(i) == nucleotide) {
                count++;
            }
        }
        return count;
    }

    /**
     * Converts a nucleotide character to its molar mass in grams/mol:
     * A 135.128, C 111.103, G 151.128, T 125.107, junk (-) 100.000.
     *
     * @param nucleotide one of A, C, G, T (either case) or '-'
     * @return the unrounded molar mass from {@link #MOLAR_MASSES}
     * @throws IllegalArgumentException if the character is not a known nucleotide
     */
    public static double nucleotideToMolarMass(char nucleotide) {
        int index = nucleotideIndexFromChar(nucleotide);
        if (index < 0) {
            throw new IllegalArgumentException("Unknown nucleotide: " + nucleotide);
        }
        return MOLAR_MASSES[index];
    }

    /**
     * Counts the As, Cs, Gs and Ts in a sequence. Junk and unrecognized
     * characters are skipped.
     *
     * @param sequence nucleotide sequence, possibly containing junk
     * @return counts in A, C, G, T order (array of size 4)
     */
    public static int[] nucleotideCountFromSequence(String sequence) {
        return Arrays.copyOf(countWithJunk(sequence), UNIQUE_NUCLEOTIDES);
    }

    /**
     * Maps a nucleotide character to its array index: A to 0, C to 1, G to 2,
     * T to 3 (either case) and the junk marker '-' to 4.
     *
     * @param nucleotide character to map
     * @return the index, or -1 if the character is not recognized
     */
    public static int nucleotideIndexFromChar(char nucleotide) {
        switch (nucleotide) {
            case 'a':
            case 'A':
                return 0;
            case 'c':
            case 'C':
                return 1;
            case 'g':
            case 'G':
                return 2;
            case 't':
            case 'T':
                return 3;
            case '-':
                return 4;
            default:
                return -1;
        }
    }

    /**
     * Removes every junk marker '-' from a sequence.
     *
     * @param sequence raw sequence
     * @return the sequence without '-' characters
     */
    public static String removeJunk(String sequence) {
        StringBuilder cleaned = new StringBuilder(sequence.length());
        for (int i = 0; i < sequence.length(); i++) {
            char c = sequence.charAt(i);
            if (c != '-') {
                cleaned.append(c);
            }
        }
        return cleaned.toString();
    }

    /**
     * Removes junk from the sequence and splits the remainder into codons of
     * {@link #CODONS_PER_NUCLEOTIDE} nucleotides. Trailing nucleotides that do not
     * fill a whole codon are dropped.
     *
     * @param sequence nucleotide sequence, possibly containing junk
     * @return the codons in order of appearance
     */
    public static String[] codonsFromSequence(String sequence) {
        String cleaned = removeJunk(sequence);
        String[] codons = new String[cleaned.length() / CODONS_PER_NUCLEOTIDE];
        for (int i = 0; i < codons.length; i++) {
            int start = i * CODONS_PER_NUCLEOTIDE;
            codons[i] = cleaned.substring(start, start + CODONS_PER_NUCLEOTIDE);
        }
        return codons;
    }

    /**
     * Prints the introduction and prompts for the input and output file names,
     * re-prompting while {@link #validFilename(String)} rejects the answer.
     * When {@link #DEBUG} is true, fixed names are returned without prompting.
     *
     * @param console scanner the answers are read from
     * @return array of size 2: input file name at index 0, output file name at index 1
     */
    public static String[] promptForFileNames(Scanner console) {
        System.out.println("This program reports information about DNA");
        System.out.println("nucleotide sequences that may encode");
        System.out.println("proteins.");

        if (DEBUG) {
            return new String[] {"dna.txt", "dna.txt"};
        }

        String[] files = new String[2];
        files[0] = promptForFileName(console, "Input file name? ");
        files[1] = promptForFileName(console, "Output file name? ");
        System.out.println();
        return files;
    }

    /**
     * Counts the lines in a file.
     *
     * @param file file to read
     * @return number of lines in the file (0 for an empty file)
     * @throws FileNotFoundException if the file cannot be opened
     */
    public static int linesInFile(File file) throws FileNotFoundException {
        int count = 0;
        Scanner input = new Scanner(file);
        try {
            while (input.hasNextLine()) {
                input.nextLine();
                count++;
            }
        } finally {
            input.close();
        }
        return count;
    }

    /**
     * Checks that a file name is usable: it must not be null or blank, because a
     * blank name would resolve to the input/output directory itself.
     *
     * @param name file name typed by the user
     * @return true if the name contains at least one non-whitespace character
     */
    public static boolean validFilename(String name) {
        return name != null && name.trim().length() > 0;
    }

    // Shows the prompt until the user enters a name accepted by validFilename.
    private static String promptForFileName(Scanner console, String prompt) {
        System.out.print(prompt);
        String name = console.nextLine();
        while (!validFilename(name)) {
            System.out.println("Invalid. Try again.");
            System.out.print(prompt);
            name = console.nextLine();
        }
        return name;
    }

    // Counts A, C, G, T and junk (index 0 to 4), skipping unrecognized characters.
    private static int[] countWithJunk(String sequence) {
        int[] counts = new int[MOLAR_MASSES.length];
        for (int i = 0; i < sequence.length(); i++) {
            int index = nucleotideIndexFromChar(sequence.charAt(i));
            if (index >= 0) {
                counts[index]++;
            }
        }
        return counts;
    }

    // Sums count * molar mass over all five categories without any rounding.
    private static double exactTotalMass(int[] counts) {
        double total = 0.0;
        for (int i = 0; i < counts.length; i++) {
            total += counts[i] * MOLAR_MASSES[i];
        }
        return total;
    }

    // Rounds with a DecimalFormat pattern; Locale.ROOT keeps '.' as the decimal
    // separator so the formatted text can always be parsed back.
    private static double round(double value, String pattern) {
        if (Double.isNaN(value) || Double.isInfinite(value)) {
            return value;
        }
        DecimalFormat format = new DecimalFormat(pattern, DecimalFormatSymbols.getInstance(Locale.ROOT));
        return Double.parseDouble(format.format(value));
    }
}
|