import java.awt.Desktop;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

/*
 * Click nbfs://nbhost/SystemFileSystem/Templates/Licenses/license-default.txt to change this license
 * Click nbfs://nbhost/SystemFileSystem/Templates/Classes/Main.java to edit this template
 */

/**
 * Interactive command-line tool that scans a slice of the NCBI SNP database for a
 * user-chosen single-base substitution (or deletion) and writes the flanking
 * nucleotide sequence of every match to a text file.
 *
 * <p>The program runs in three stages, each backed by one E-utilities endpoint:
 * <ol>
 *   <li>{@code esearch} - collect SNP identifiers, grouped into batches;</li>
 *   <li>{@code esummary} - read the SPDI mutation descriptors of each batch and
 *       keep those matching the requested mutation;</li>
 *   <li>{@code efetch} - download the sequence surrounding each matching position.</li>
 * </ol>
 *
 * <p>NOTE(review): the XML tag names used when parsing responses ({@code <Count>},
 * {@code <Id>}, {@code <SPDI>}) follow the E-utilities XML output format; confirm
 * them against a live response if NCBI changes its schema.
 *
 * @author devincamenares
 */
public class snpSeq {

    /** Base address shared by every NCBI E-utilities request made by this program. */
    private static final String EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";

    /** Matches every character that is neither a digit nor a dot. */
    private static final Pattern NON_NUMERIC = Pattern.compile("[^\\d.]");

    /**
     * Single reader for standard input. Creating several Scanners over System.in can
     * lose input that an earlier Scanner has already buffered, so all prompts share this one.
     */
    private static final Scanner STDIN = new Scanner(System.in);

    private static final Logger LOGGER = Logger.getLogger(snpSeq.class.getName());

    /**
     * Entry point: asks the user for the search parameters, runs the three retrieval
     * stages and writes the results to {@code <timestamp>_<from>_<to>_<start>_<count>.txt}
     * in the working directory.
     *
     * @param args ignored
     * @throws IOException if the initial database-size request fails
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        /** A timestamp, used for generating unique file IDs. */
        final long timeUnique = System.currentTimeMillis();
        /** A name prefix, created based upon the timestamp. */
        String dirName = Long.toString(timeUnique);

        // THIS RETRIEVES DATABASE INFORMATION
        int snpDBcount;
        try (BufferedReader countReader = openReader(EUTILS_BASE + "einfo.fcgi?db=snp")) {
            snpDBcount = parseFirstCount(countReader);
        }

        // THIS BLOCK IS FOR GETTING USER INPUT
        System.out.println("This program is designed to scan sections of the SNP database and report back sequences that flank particular signle base pair substitutions.");
        System.out.println("Let's Begin - Choose your first nucleotide - A, C, G, or T");
        String userFirstChoice = "";
        String userSecondChoice = "";
        boolean choiceGood = false;
        while (!choiceGood) {
            userFirstChoice = STDIN.nextLine().trim().toUpperCase(Locale.ROOT);
            if (isBase(userFirstChoice)) {
                choiceGood = true;
            } else {
                System.out.println("Hmm... you picked " + userFirstChoice + ". That doesn't seem right - it needs to be an A, T, C, or G. Try again!");
            }
        }

        System.out.println("Excellent. Now, what will it mutate to? Pick A, T, C, or G, but not the same as before! It could also be - to indicate a deletion");
        choiceGood = false;
        while (!choiceGood) {
            userSecondChoice = STDIN.nextLine().trim().toUpperCase(Locale.ROOT);
            if (userSecondChoice.equals(userFirstChoice)) {
                System.out.println("Whoops! You picked the same nucleotide as before (" + userFirstChoice + ">" + userSecondChoice + "). Try again!");
            } else if (isBase(userSecondChoice) || userSecondChoice.equals("-")) {
                choiceGood = true;
            } else {
                System.out.println("Hmm... you picked " + userSecondChoice + ". That doesn't seem right - it needs to be an A, T, C, G, or -. Try again!");
            }
        }

        String repSuggest = suggestAmbiguityCode(userFirstChoice, userSecondChoice);
        System.out.println("Now, when retrieving the sequence, what should I use to represent the mutated residue? Based on your mutation choice, I recommend a " + repSuggest + ". You can leave this blank to use the recommendation");
        String representMut = userDefault(repSuggest);
        System.out.println("Thanks, processing...");
        System.out.println("Your original nucleotide was " + userFirstChoice + " and this mutated to a " + userSecondChoice + ". This will be represented in the final result as a " + representMut);
        System.out.println("******************************************");

        System.out.println("What search terms do you want to use to filter results? For the entire database, type all[sb] or leave blank by just hitting enter.");
        String searchTerm = userDefault("all[sb]");

        System.out.println("What entry # from the search do you want to begin at? Please enter a number. Specify 'RANDOM' or leave as default for a scattershot across the database");
        String retStartInput = userDefault("RANDOM").trim();
        boolean randomStart = retStartInput.equalsIgnoreCase("RANDOM");
        int retStart = 1;
        if (isInt(retStartInput)) {
            retStart = Integer.parseInt(retStartInput);
        }

        System.out.println("Now, how many total entries do you want to scan? Please enter a number (Default is 100000. The database has approximately " + snpDBcount + " entries)");
        int retMaxT = userDefault(100000);
        // Only adjust when the database size is actually known, and never below zero.
        if (snpDBcount > 0 && ((long) retStart + retMaxT) > snpDBcount) {
            retStart = Math.max(0, snpDBcount - (retMaxT + 1));
            System.out.println("Based on that choice, the starting point has been adjusted to " + retStart + " so that it will not exceed the size of the entire database");
        }

        System.out.println("In what size batches do you want to scan for entries? Please enter a number (Default is set to previous answer, cannot be greater than this or 9999)");
        // A batch size below one would divide by zero when the windows are computed.
        int retMaxB = Math.max(1, userDefaultMax(retMaxT, Math.min(retMaxT, 9999)));

        System.out.println("OK, what range of flanking sequences do you want - please enter a number (Default is 100)");
        int flank = userDefault(100);

        System.out.println("NCBI doesn't like frequent requests to their server. By how many miliseconds should I delay each nucleotide request? Please enter a number - 350 is recommended, skip to keep this default value");
        // Thread.sleep rejects negative values.
        int timeDelay = Math.max(0, userDefault(350));

        System.out.println("NCBI also doesn't like retrieving too many entries from a single URL. What size do you want each batch to be? Please enter a number, at or less than 200 is recommended, skip to keep this default value.");
        int batchSize = Math.max(1, userDefault(200));

        System.out.println("This program could take awhile to run. Approximately how often, in seconds, would you like an update on the progress? The default value is 180 - or about three minutes");
        int progressReport = userDefault(180);

        System.out.println("Do you want to save the unique identifiers for each SNP entry in a separate file? Type YES to get this file, or anything else (default) to skip this.");
        boolean saveIdList = STDIN.nextLine().trim().toUpperCase(Locale.ROOT).equals("YES");

        System.out.println("Finally, do you want to save, as a separate file, the unique identifier for each mutation scanner - regardless if it matches the desired mutation? This will give you an idea of what other mutations are analyzed. Type YES to get this file, or anything else (default) to skip this.");
        boolean saveSpdiList = STDIN.nextLine().trim().toUpperCase(Locale.ROOT).equals("YES");

        String inputSummary = "Great! I will search using " + searchTerm + " to look for " + retMaxT + " entries in batches of " + retMaxB;
        if (randomStart) {
            inputSummary += " at random starting points";
        } else {
            inputSummary += " starting at result " + retStart;
        }
        inputSummary += " that match a mutation of " + userFirstChoice + " to " + userSecondChoice
                + ". If I find any, I'll wait " + timeDelay + " miliseconds and then report back " + flank
                + "bp flanking either side of the mutation, which will be indicated as " + representMut + "."
                + System.lineSeparator();
        System.out.println(inputSummary + "*** Here we go! ***" + System.lineSeparator());

        // BLOCK TO SETUP WRITING TO FILE
        /*
         * The directory created by the program, wherein files are written:
         *     File dir = new File(dirName); dir.mkdir();
         * This doesn't work on mac for some reason, so files go to the working directory instead.
         */
        /** String used to hold pathnames. */
        String pathName = dirName + "_" + userFirstChoice + "_" + userSecondChoice + "_" + retStart + "_" + retMaxT;
        /** File to which the run summary and every collected sequence is written. */
        File file1 = new File(pathName + ".txt");
        writeFile(inputSummary + System.lineSeparator(), file1);

        // BLOCK TO PROCESS REQUEST
        int entriesSearched = 0;
        int mutationsQueried = 0;
        int sequencesCollected = 0;
        /** Index i holds how many entries carried exactly i+1 mutations. */
        List<Integer> multipleMut = new ArrayList<>();
        /** Comma-separated identifier lists, each holding at most batchSize identifiers. */
        List<String> idsBatches = new ArrayList<>();
        /** Identifiers read so far that have not yet filled a complete batch. */
        List<String> pendingIds = new ArrayList<>();
        long timeUpdate = System.currentTimeMillis();
        int updateCounter = 0;

        // BLOCK FOR 1ST WEBSITE CALL, DIRECTED START
        int retBatch = retMaxT / retMaxB;
        int retRemain = retMaxT - (retMaxB * retBatch);
        List<int[]> retArr = buildSearchWindows(retStart, retMaxT, retMaxB);
        System.out.println("Created coordinatres for " + retBatch + " batches of searches, with an additional " + retRemain + " entries leftover, for a total of " + retArr.size() + " iterations in the first step");

        // The term is user input; encode it so spaces and brackets survive in the query string.
        String encodedTerm = URLEncoder.encode(searchTerm, "UTF-8");

        for (int x = 0; x < retArr.size(); x++) {
            // Give the user a progress update once per reporting interval.
            long timeExpired = (System.currentTimeMillis() - timeUpdate) / 1000;
            if (timeExpired > progressReport) {
                updateCounter++;
                System.out.println("It's been about " + elapsedMinutes(progressReport, updateCounter) + " minutes, and so far I have examined " + x + " batches of entries to find their mutation identifier.");
                timeUpdate = System.currentTimeMillis();
                System.out.println("Now, back to work....");
            }

            int retStartX = retArr.get(x)[0];
            // FOR RANDOM retStart
            if (randomStart) {
                retStartX = (int) (Math.max(0, snpDBcount - retMaxB) * Math.random());
            }
            int retMaxX = retArr.get(x)[1];

            // Pause Execution - cannot be more than 3 every second.
            Thread.sleep(timeDelay);

            String searchUrl = EUTILS_BASE + "esearch.fcgi?db=snp&term=" + encodedTerm
                    + "&retmax=" + retMaxX + "&retstart=" + retStartX;
            try (BufferedReader br1 = openReader(searchUrl)) {
                String line1;
                while ((line1 = br1.readLine()) != null) {
                    if (!line1.contains("<Id>")) {
                        continue;
                    }
                    String id = digitsOf(line1);
                    if (id.isEmpty()) {
                        continue;
                    }
                    entriesSearched++;
                    pendingIds.add(id);
                    if (pendingIds.size() >= batchSize) {
                        idsBatches.add(String.join(",", pendingIds));
                        pendingIds.clear();
                    }
                }
            } catch (IOException e) {
                System.out.println("Encountered a problem after searching " + entriesSearched + " entries. Exception: " + e);
            }
        }
        // Flush the final partial batch exactly once, after every window has been read.
        if (!pendingIds.isEmpty()) {
            idsBatches.add(String.join(",", pendingIds));
            pendingIds.clear();
        }

        // BLOCK 1.5 - extra step to save results of first part
        if (saveIdList && !idsBatches.isEmpty()) {
            File file2 = new File(pathName + "_IDs.txt");
            writeFile(String.join(",", idsBatches), file2);
        }

        // BLOCK FOR 2ND WEBSITE
        StringBuilder savedSPDI = new StringBuilder();
        System.out.println("Entry search complete, scanned " + entriesSearched + " entries and grouped them into " + idsBatches.size() + " batches. Now proceeding to mutation identification and filtering, followed by sequence extraction!");

        for (int j = 0; j < idsBatches.size(); j++) {
            // Give the user a progress update once per reporting interval.
            long timeExpired = (System.currentTimeMillis() - timeUpdate) / 1000;
            if (timeExpired > progressReport && j > 0) {
                updateCounter++;
                // Batches 0..j-1 are finished at this point, i.e. j batches.
                long entriesDone = (long) j * batchSize;
                System.out.println("It's been about " + elapsedMinutes(progressReport, updateCounter)
                        + " minutes, and so far I have collected " + sequencesCollected
                        + " sequences by checking " + mutationsQueried
                        + " mutations, having completed analysis of " + j
                        + " batches each with " + batchSize
                        + " entries. (For " + entriesDone
                        + " entries analyzed out of the total of " + entriesSearched
                        + ", or about " + (100 * entriesDone) / entriesSearched + "% complete)");
                timeUpdate = System.currentTimeMillis();
                System.out.println("Now, back to work....");
            }

            // Pause Execution - cannot be more than 3 every second.
            Thread.sleep(timeDelay);

            String summaryUrl = EUTILS_BASE + "esummary.fcgi?db=snp&id=" + idsBatches.get(j);
            try (BufferedReader br2 = openReader(summaryUrl)) {
                String line2;
                /** Identifier of the entry currently being read, taken from the latest "uid" line. */
                String spdiIndex = null;
                int lineCounter = 0;
                while ((line2 = br2.readLine()) != null) {
                    lineCounter++;
                    boolean spdiLine = line2.contains("<SPDI>") && line2.contains("</SPDI>")
                            && (line2.length() > 14) && line2.contains(":");
                    if (!spdiLine) {
                        if (line2.contains("uid")) {
                            spdiIndex = digitsOf(line2);
                        }
                        continue;
                    }

                    String spdiText = line2.replace("<SPDI>", "").replace("</SPDI>", "").trim();
                    if (saveSpdiList) {
                        savedSPDI.append(spdiText).append(System.lineSeparator());
                    }
                    String[] spdiX = spdiText.split(",");
                    // Keeps track of how many entries have multiple mutations
                    recordMutationCount(multipleMut, spdiX.length);

                    for (String spdi : spdiX) {
                        mutationsQueried++;
                        String[] spdiInfo = parseSpdi(spdi);
                        if (spdiInfo == null) {
                            System.out.println("Encountered error in batch " + j
                                    + " with parsing SPDI information with the following line: " + spdi
                                    + " is from an array of size " + spdiX.length + "." + System.lineSeparator()
                                    + " Line of problem: " + lineCounter + "." + System.lineSeparator()
                                    + " ID Batch: " + idsBatches.get(j) + "." + System.lineSeparator()
                                    + "Error is : fewer than three ':'-separated fields, specifically the array is size "
                                    + spdi.split(":").length);
                            continue;
                        }
                        if (!(spdiInfo[2].equals(userFirstChoice) && spdiInfo[3].equals(userSecondChoice))) {
                            continue;
                        }

                        int mutPos;
                        try {
                            mutPos = Integer.parseInt(spdiInfo[1]);
                        } catch (NumberFormatException e) {
                            System.out.println("Skipping " + spdi + " because its position is not a number. Exception: " + e);
                            continue;
                        }

                        StringBuilder header = new StringBuilder(">" + spdiIndex + "_" + spdiInfo[0] + "_" + spdiInfo[1] + "_" + spdiInfo[2] + "_" + spdiInfo[3] + " ");

                        // BLOCK FOR 3RD WEBSITE
                        // Pause Execution - cannot be more than 3 every second.
                        Thread.sleep(timeDelay);

                        // SPDI positions are 0-based while efetch coordinates are 1-based, so the
                        // mutated residue sits at 1-based position mutPos + 1. Request `flank`
                        // residues on each side, never starting before the first residue.
                        int centerPos = mutPos + 1;
                        int startPos = Math.max(1, centerPos - flank);
                        int endPos = centerPos + flank;
                        StringBuilder seqResult = new StringBuilder();
                        boolean fetched = false;

                        String fetchUrl = EUTILS_BASE + "efetch.fcgi?db=nuccore&id=" + spdiInfo[0]
                                + "&seq_start=" + startPos + "&seq_stop=" + endPos + "&rettype=fasta";
                        try (BufferedReader br3 = openReader(fetchUrl)) {
                            String line3;
                            while ((line3 = br3.readLine()) != null) {
                                if (line3.contains(">")) {
                                    header.append(line3.replace(">", "")).append(System.lineSeparator());
                                } else {
                                    seqResult.append(line3);
                                }
                            }
                            fetched = true;
                        } catch (IOException e) {
                            System.out.println("Got to " + (long) j * batchSize + " entries, but encountered an error, as follows:");
                            System.out.println(e);
                        }
                        if (!fetched || seqResult.length() == 0) {
                            // Nothing usable came back; skip this record instead of crashing.
                            continue;
                        }

                        int mutIndex = centerPos - startPos;
                        if (mutIndex >= seqResult.length()) {
                            System.out.println("The sequence returned for " + spdi + " is shorter than expected, so the mutated residue could not be marked.");
                        }
                        String marked = spliceMutation(seqResult.toString(), mutIndex, representMut);
                        writeFile(header + marked + System.lineSeparator() + System.lineSeparator(), file1);
                        sequencesCollected++;
                    }
                }
            } catch (IOException e) {
                System.out.println("Encountered a problem with accessing URL for batch " + j + ". Exception: " + e);
            }
        } // END BLOCK FOR 2ND STREAM

        // BLOCK 2.5, save SPDI if requested
        if (saveSpdiList) {
            File file3 = new File(pathName + "_SPDIs.txt");
            writeFile(savedSPDI.toString(), file3);
        }

        // BLOCK TO FINALIZE AND OPEN FILE
        long endTime = (System.currentTimeMillis() - timeUnique) / 1000;
        writeFile("Phew! It took me about " + endTime + " seconds, but I collected " + sequencesCollected
                + " sequences by checking " + mutationsQueried + " mutations across " + entriesSearched
                + " database entries, retrieving them in " + idsBatches.size() + " batches of up to "
                + batchSize + " sequences each." + System.lineSeparator(), file1);
        writeFile("The entries scanned had the following properties: ", file1);
        for (int i = 0; i < multipleMut.size(); i++) {
            writeFile(multipleMut.get(i) + " entries had " + (i + 1) + " mutation(s). ", file1);
        }
        writeFile(System.lineSeparator() + "Good luck with your results. Live long and prosper!", file1);

        System.out.println("I'm done! Look ma, results! Please enter to exit");
        if (STDIN.hasNextLine()) {
            STDIN.nextLine();
        }
        System.out.println("Goodbye!");
        openFile(file1);
    }

    /**
     * Opens a UTF-8 line reader over the response of the given address.
     * Closing the returned reader closes the underlying connection stream.
     *
     * @param address absolute URL to request
     * @return a buffered reader over the response body
     * @throws IOException if the URL is malformed or the connection cannot be opened
     */
    private static BufferedReader openReader(String address) throws IOException {
        URLConnection connection = new URL(address).openConnection();
        InputStream in = connection.getInputStream();
        return new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
    }

    /**
     * Reads lines until one containing {@code <Count>} yields a positive integer.
     *
     * @param reader source of response lines
     * @return the first positive count found, or 0 when there is none
     * @throws IOException if reading fails
     */
    private static int parseFirstCount(BufferedReader reader) throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            if (!line.contains("<Count>")) {
                continue;
            }
            try {
                int count = Integer.parseInt(digitsOf(line));
                if (count != 0) {
                    return count;
                }
            } catch (NumberFormatException ignored) {
                // Not a plain integer on this line; keep looking at later lines.
            }
        }
        return 0;
    }

    /**
     * Removes every character that is not a digit or a dot.
     * Package-private so it can be unit tested.
     *
     * @param text any non-null string
     * @return the digits (and dots) of {@code text}, in order
     */
    static String digitsOf(String text) {
        return NON_NUMERIC.matcher(text).replaceAll("");
    }

    /**
     * Tells whether the text is exactly one of the four nucleotide letters A, C, G or T.
     *
     * @param text candidate, expected upper-case
     * @return {@code true} for "A", "C", "G" or "T"
     */
    static boolean isBase(String text) {
        return text != null && text.length() == 1 && "ACGT".indexOf(text.charAt(0)) >= 0;
    }

    /**
     * Picks the one-letter code recommended for representing a mutation between two bases.
     * The order of the two bases does not matter.
     *
     * @param first original nucleotide
     * @param second mutated nucleotide, or "-" for a deletion
     * @return "-" for a deletion, the two-base ambiguity code (W, R, M, Y, S, K) for a
     *     substitution, or "N" for any other pair
     */
    static String suggestAmbiguityCode(String first, String second) {
        if ("-".equals(second)) {
            return "-";
        }
        String[] pair = {first, second};
        Arrays.sort(pair);
        switch (pair[0] + pair[1]) {
            case "AT":
                return "W";
            case "AG":
                return "R";
            case "AC":
                return "M";
            case "CT":
                return "Y";
            case "CG":
                return "S";
            case "GT":
                return "K";
            default:
                return "N";
        }
    }

    /**
     * Splits a request for {@code retMaxT} records starting at {@code retStart} into
     * contiguous windows of at most {@code retMaxB} records.
     *
     * @param retStart index of the first record wanted
     * @param retMaxT total number of records wanted
     * @param retMaxB maximum number of records per window
     * @return windows as {@code {start, count}} pairs; empty when either size is not positive
     */
    static List<int[]> buildSearchWindows(int retStart, int retMaxT, int retMaxB) {
        List<int[]> windows = new ArrayList<>();
        if (retMaxT <= 0 || retMaxB <= 0) {
            return windows;
        }
        // long offset so the increment cannot overflow for very large totals.
        for (long offset = 0; offset < retMaxT; offset += retMaxB) {
            int count = (int) Math.min(retMaxB, retMaxT - offset);
            windows.add(new int[] {(int) (retStart + offset), count});
        }
        return windows;
    }

    /**
     * Adds one to the tally of entries carrying {@code mutationCount} mutations, growing
     * the list with zeros when that many mutations has not been seen before.
     *
     * @param counts tally where index i counts entries with i+1 mutations; modified in place
     * @param mutationCount number of mutations on the entry; values below 1 are ignored
     */
    static void recordMutationCount(List<Integer> counts, int mutationCount) {
        if (mutationCount < 1) {
            return;
        }
        while (counts.size() < mutationCount) {
            counts.add(0);
        }
        counts.set(mutationCount - 1, counts.get(mutationCount - 1) + 1);
    }

    /**
     * Splits one SPDI descriptor ({@code sequence:position:deleted:inserted}) into its fields.
     * A missing fourth field is reported as "-" (a deletion).
     *
     * @param spdi a single descriptor
     * @return a four-element array, or {@code null} when fewer than three fields are present
     */
    static String[] parseSpdi(String spdi) {
        String[] parts = spdi.split(":");
        if (parts.length < 3) {
            return null;
        }
        String inserted = parts.length > 3 ? parts[3] : "-";
        return new String[] {parts[0], parts[1], parts[2], inserted};
    }

    /**
     * Replaces the single character at {@code index} with {@code replacement}.
     *
     * @param sequence text to edit
     * @param index zero-based position of the character to replace
     * @param replacement text put in its place
     * @return the edited text, or {@code sequence} unchanged when {@code index} is out of range
     */
    static String spliceMutation(String sequence, int index, String replacement) {
        if (index < 0 || index >= sequence.length()) {
            return sequence;
        }
        return sequence.substring(0, index) + replacement + sequence.substring(index + 1);
    }

    /**
     * Approximates the minutes elapsed after a number of progress reports.
     * Uses floating-point division so that intervals under a minute are not rounded to zero.
     *
     * @param progressReport reporting interval in seconds
     * @param updateCounter number of reports issued so far
     * @return the rounded number of minutes
     */
    static long elapsedMinutes(int progressReport, int updateCounter) {
        return Math.round((progressReport / 60.0) * updateCounter);
    }

    /**
     * Appends text to a file, creating it when necessary. Failures are logged, not thrown.
     *
     * @param content text to append
     * @param file destination file
     */
    private static void writeFile(String content, File file) {
        try (FileWriter fw = new FileWriter(file, true)) {
            fw.write(content);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Opens the file with the platform's default application when the platform supports it.
     * The desktop is looked up here rather than in a static field so that loading this class
     * does not fail on systems without desktop support.
     *
     * @param file file to open
     */
    private static void openFile(File file) {
        if (!file.exists()) {
            System.out.println("No results file was written, so there is nothing to open.");
            return;
        }
        try {
            if (Desktop.isDesktopSupported() && Desktop.getDesktop().isSupported(Desktop.Action.OPEN)) {
                Desktop.getDesktop().open(file);
            } else {
                System.out.println("Results were saved to " + file.getAbsolutePath());
            }
        } catch (IOException | RuntimeException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Reads an integer from standard input, re-prompting until the input is valid.
     *
     * @param defaultVal value used when the user enters a blank line
     * @return the number entered, or {@code defaultVal} for a blank line
     */
    private static int userDefault(int defaultVal) {
        while (true) {
            String inputS = STDIN.nextLine().trim();
            if (inputS.isEmpty()) {
                System.out.println("Great, using the default value of " + defaultVal);
                return defaultVal;
            }
            try {
                return Integer.parseInt(inputS);
            } catch (NumberFormatException ex) {
                System.out.println("Whoops! looks like you didn't provide a valid number. Please try again!");
            }
        }
    }

    /**
     * Reads a line of text from standard input.
     *
     * @param defaultVal value used when the user enters an empty line
     * @return the text entered, or {@code defaultVal} for an empty line
     */
    private static String userDefault(String defaultVal) {
        String inputS = STDIN.nextLine();
        if (inputS.equals("")) {
            System.out.println("Great, using the default value of " + defaultVal);
            return defaultVal;
        }
        return inputS;
    }

    /**
     * Reads an integer from standard input, re-prompting until the input is valid, and caps
     * the result at {@code maxVal}.
     *
     * @param defaultVal value used when the user enters a blank line
     * @param maxVal largest value that may be returned
     * @return the chosen value, never greater than {@code maxVal}
     */
    private static int userDefaultMax(int defaultVal, int maxVal) {
        while (true) {
            String inputS = STDIN.nextLine().trim();
            boolean usedDefault = inputS.isEmpty();
            int result;
            try {
                result = usedDefault ? defaultVal : Integer.parseInt(inputS);
            } catch (NumberFormatException ex) {
                System.out.println("Whoops! looks like you didn't provide a valid number. Please try again!");
                continue;
            }
            if (result > maxVal) {
                result = maxVal;
            }
            if (usedDefault) {
                System.out.println("Great, using the default value of " + result);
            } else {
                System.out.println("Great, using a value of " + result);
            }
            return result;
        }
    }

    /**
     * Tells whether the text can be parsed by {@link Integer#parseInt(String)}.
     *
     * @param str text to check; may be {@code null}
     * @return {@code true} when {@code str} is a valid {@code int}
     */
    public static boolean isInt(String str) {
        if (str == null) {
            return false;
        }
        try {
            Integer.parseInt(str);
            return true;
        } catch (NumberFormatException nfe) {
            return false;
        }
    }

    /**
     * Builds a random non-negative number from independently chosen decimal digits.
     *
     * @param digits how many decimal digits to draw; capped at 9 so the result fits an int
     * @return a value in {@code [0, 10^digits)}; 0 when {@code digits} is not positive
     */
    static int randomNumber(int digits) {
        int res = 0;
        int place = 1;
        for (int i = 0; i < Math.min(digits, 9); i++) {
            // Parenthesised so the cast applies to the scaled value, not to Math.random() alone.
            res += (int) (Math.random() * 10) * place;
            place *= 10;
        }
        return res;
    }
}