From: Sarah Westcott Date: Tue, 1 Apr 2014 18:13:11 +0000 (-0400) Subject: working of get.mimarkspackage and sra command X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=f7184748e1519090deecfb6dd9fda118ffba2b53 working of get.mimarkspackage and sra command --- diff --git a/commandfactory.cpp b/commandfactory.cpp index e8e9c27..8c5c0fa 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -150,6 +150,7 @@ #include "kruskalwalliscommand.h" #include "sracommand.h" #include "mergesfffilecommand.h" +#include "getmimarkspackagecommand.h" /*******************************************************/ @@ -323,6 +324,7 @@ CommandFactory::CommandFactory(){ commands["kruskal.wallis"] = "kruskal.wallis"; commands["sra"] = "sra"; commands["merge.sfffiles"] = "merge.sfffiles"; + commands["get.mimarkspackage"] = "get.mimarkspackage"; } @@ -552,6 +554,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "kruskal.wallis") { command = new KruskalWallisCommand(optionString); } else if(commandName == "sra") { command = new SRACommand(optionString); } else if(commandName == "merge.sfffiles") { command = new MergeSfffilesCommand(optionString); } + else if(commandName == "get.mimarkspackage") { command = new GetMIMarksPackageCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -722,6 +725,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "kruskal.wallis") { pipecommand = new KruskalWallisCommand(optionString); } else if(commandName == "sra") { pipecommand = new SRACommand(optionString); } else if(commandName == "merge.sfffiles") { pipecommand = new MergeSfffilesCommand(optionString); } + else if(commandName == "get.mimarkspackage") { pipecommand = new GetMIMarksPackageCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -878,6 +882,7 @@ Command* 
CommandFactory::getCommand(string commandName){ else if(commandName == "kruskal.wallis") { shellcommand = new KruskalWallisCommand(); } else if(commandName == "sra") { shellcommand = new SRACommand(); } else if(commandName == "merge.sfffiles") { shellcommand = new MergeSfffilesCommand(); } + else if(commandName == "get.mimarkspackage") { shellcommand = new GetMIMarksPackageCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/getmimarkspackagecommand.cpp b/getmimarkspackagecommand.cpp index 43138e1..2f37405 100644 --- a/getmimarkspackagecommand.cpp +++ b/getmimarkspackagecommand.cpp @@ -7,13 +7,17 @@ // #include "getmimarkspackagecommand.h" +#include "groupmap.h" //********************************************************************************************************************** vector GetMIMarksPackageCommand::setParameters(){ try { //files that have dependancies CommandParameter pgroup("group", "InputTypes", "", "", "groupOligos", "none", "none","",false,false); parameters.push_back(pgroup); + CommandParameter pfile("file", "InputTypes", "", "", "groupOligos", "none", "none","",false,false); parameters.push_back(pfile); CommandParameter poligos("oligos", "InputTypes", "", "", "groupOligos", "none", "none","",false,false); parameters.push_back(poligos); + CommandParameter ppackage("package", "Multiple", "air-host_associated-human_associated-human_gut-human_oral-human_skin-human_vaginal-microbial-miscellaneous-plant_associated-sediment-soil-wastewater-water", "miscellaneous", "", "", "","",false,false,true); parameters.push_back(ppackage); + CommandParameter prequiredonly("requiredonly", "Boolean", "", "F", "", "", "","",false,false, true); parameters.push_back(prequiredonly); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -30,14 +34,15 
@@ vector GetMIMarksPackageCommand::setParameters(){ string GetMIMarksPackageCommand::getHelpString(){ try { string helpString = ""; - helpString += "The get.mimarkspackage command creates a mimarks package form with your groups. The required fields are flagged with * characters.\n"; + helpString += "The get.mimarkspackage command creates a mimarks package form with your groups. The required fields are flagged with * characters. Fields marked with '**' indicated they are in a group where at least one of the fields is required.\n"; helpString += "Further documentation on the different packages and required formats can be found here, http://www.mothur.org/wiki/MIMarks_Data_Packages.\n"; - helpString += "The get.mimarkspackage command parameters are: oligos, group and package. oligos or group is required.\n"; + helpString += "The get.mimarkspackage command parameters are: oligos, group, package and requiredonly. oligos or group is required.\n"; helpString += "The oligos parameter is used to provide your oligos file so mothur can extract your group names.\n"; helpString += "The group parameter is used to provide your group file so mothur can extract your group names.\n"; - helpString += "The package parameter is used to select the mimarks package you would like to use. Default=???\n"; + helpString += "The package parameter is used to select the mimarks package you would like to use. The choices are: air, host_associated, human_associated, human_gut, human_oral, human_skin, human_vaginal, microbial, miscellaneous, plant_associated, sediment, soil, wastewater or waterc. Default=miscellaneous.\n"; + helpString += "The requiredonly parameter is used to indicate you only want the required mimarks feilds printed. 
Default=F.\n"; helpString += "The get.mimarkspackage command should be in the following format: get.mimarkspackage(oligos=yourOligosFile, package=yourPackage)\n"; - helpString += "get.mimarkspackage(oligos=GQY1XT001.oligos, package=???)\n"; + helpString += "get.mimarkspackage(oligos=GQY1XT001.oligos, package=human_gut)\n"; return helpString; } catch(exception& e) { @@ -121,42 +126,57 @@ GetMIMarksPackageCommand::GetMIMarksPackageCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("file"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + } } groupfile = validParameter.validFile(parameters, "group", true); if (groupfile == "not open") { groupfile = ""; abort = true; } else if (groupfile == "not found") { groupfile = ""; } - else { m->setGroupFile(groupfile); } + else { m->setGroupFile(groupfile); inputfile = groupfile; } + + file = validParameter.validFile(parameters, "file", true); + if (file == "not open") { file = ""; abort = true; } + else if (file == "not found") { file = ""; } + else { inputfile = file; } oligosfile = validParameter.validFile(parameters, "oligos", true); if (oligosfile == "not found") { oligosfile = ""; } else if(oligosfile == "not open") { abort = true; } - else { m->setOligosFile(oligosfile); } + else { m->setOligosFile(oligosfile); inputfile = oligosfile; } - if ((groupfile != "") && (oligosfile != "")) { - m->mothurOut("[ERROR]: You may not use a group file and an oligos file, only one."); m->mothurOutEndLine(); abort = true; + if ((groupfile != "") && (oligosfile != "") && (file != "")) { + m->mothurOut("[ERROR]: You may not use a group file, file and an oligos file, only one."); m->mothurOutEndLine(); abort = true; } - if ((groupfile == "") && (oligosfile == "")) { + if 
((groupfile == "") && (oligosfile == "") && (file == "")) { oligosfile = m->getOligosFile(); - if (oligosfile != "") { m->mothurOut("Using " + oligosfile + " as input file for the oligos parameter."); m->mothurOutEndLine(); } + if (oligosfile != "") { inputfile = oligosfile; m->mothurOut("Using " + oligosfile + " as input file for the oligos parameter."); m->mothurOutEndLine(); } else { groupfile = m->getGroupFile(); - if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } + if (groupfile != "") { inputfile = groupfile; m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("[ERROR]: You must provide groupfile or oligos file for the get.mimarkspackage command."); m->mothurOutEndLine(); abort = true; + m->mothurOut("[ERROR]: You must provide file, groupfile or oligos file for the get.mimarkspackage command."); m->mothurOutEndLine(); abort = true; } } } - package = validParameter.validFile(parameters, "package", false); if (package == "not found") { package = "package"; } - //if (!checkCasesPackage(package)) { abort = true; } //error message in function + package = validParameter.validFile(parameters, "package", false); if (package == "not found") { package = "miscellaneous"; } - //turn _ to spaces mothur's work around - for (int i = 0; i < package.length(); i++) { if (package[i] == '_') { package[i] = ' '; } } - - + if ((package == "air") || (package == "host_associated") || (package == "human_associated") || (package == "human_gut") || (package == "human_oral") || (package == "human_skin") || (package == "human_vaginal") || (package == "microbial") || (package == "miscellaneous") || (package == "plant_associated") || (package == "sediment") || (package == "soil") || (package == "wastewater") || (package == "water")) {} + else { + m->mothurOut("[ERROR]: " + package + " is not a valid package selection. 
Choices are: air, host_associated, human_associated, human_gut, human_oral, human_skin, human_vaginal, microbial, miscellaneous, plant_associated, sediment, soil, wastewater or water. Aborting.\n."); abort = true; + } + + string temp; + temp = validParameter.validFile(parameters, "requiredonly", false); if(temp == "not found"){ temp = "F"; } + requiredonly = m->isTrue(temp); } } @@ -172,10 +192,131 @@ int GetMIMarksPackageCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - - + if (oligosfile != "") { readOligos(); } + else if (file != "") { readFile(); } + else { GroupMap groupmap(groupfile); groupmap.readMap(); Groups = groupmap.getNamesOfGroups(); } + if (outputDir == "") { outputDir += m->hasPath(inputfile); } + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inputfile)); + string outputFileName = getOutputFileName("tsv", variables); + ofstream out; + m->openOutputFile(outputFileName, out); + outputNames.push_back(outputFileName); outputTypes["tsv"].push_back(outputFileName); + + out << "#This is a tab-delimited file. Additional Documentation can be found at http://www.mothur.org/wiki/MIMarks_Data_Packages." << endl; + out << "#Please fill all the required fields indicated with '*'" << endl; + out << "#Fields marked with '**' indicated they are in a group where at least one of the fields is required." << endl; + out << "#Unknown or inapplicable fields can be assigned NA value." << endl; + out << "#You may add extra custom fields to this template. Make sure all the fields are separated by tabs." << endl; + out << "#You may remove any fields not required (marked with '*'). Make sure all the fields are separated by tabs." << endl; + out << "#You can edit this template using Microsoft Excel or any other editor. But while saving the file please make sure to save them as 'TAB-DELIMITED' TEXT FILE." 
<< endl; + + if (package == "air") { + out << "#Environmental:MIMARKS.specimen.air.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *altitude" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *altitude barometric_press carb_dioxide carb_monoxide chem_administration elev humidity methane misc_param organism_count oxygen oxy_stat_samp perturbation pollutants resp_part_matter samp_size samp_salinity samp_store_dur samp_store_loc samp_store_temp solar_irradiance temp ventilation_rate ventilation_type volatile_org_comp wind_direction wind_speed" << endl; + } + }else if (package == "host_associated") { + out << "#Environmental:MIMARKS.specimen.host-associated.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *host **clone **isolate **strain" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods **clone **isolate **strain rel_to_oxygen samp_collect_device samp_mat_process *host age altitude blood_press_diast blood_press_syst body_habitat body_product tissue chem_administration depth diet disease_stat dry_mass elev family_relationship genotype gravidity height_or_length host_body_temp host_color host_growth_cond host_shape host_subject_id host_taxid infra_specific_name infra_specific_rank last_meal life_stage misc_param organism_count oxy_stat_samp perturbation phenotype samp_size samp_salinity samp_store_dur samp_store_loc samp_store_temp sex substrate temp tot_mass" << endl; + } + }else if (package == "human_associated") { + out << 
"#Environmental:MIMARKS.specimen.human-associated.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *host" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *host hiv_stat ihmc_ethnicity ihmc_medication_code age amniotic_fluid_color foetal_health_stat gestation_state maternal_health_stat blood_blood_disord body_product tissue body_mass_index chem_administration diet disease_stat drug_usage family_relationship genotype height host_body_temp host_subject_id last_meal nose_throat_disord pulmonary_disord diet_last_six_month medic_hist_perform misc_param occupation organism_count oxy_stat_samp perturbation phenotype pet_farm_animal pulse samp_size samp_salinity samp_store_dur samp_store_loc samp_store_temp sex smoker study_complt_stat temp tot_mass travel_out_six_month twin_sibling urine_collect_meth kidney_disord urogenit_tract_disor weight_loss_3_month" << endl; + } + }else if (package == "human_gut") { + out << "#Environmental:MIMARKS.specimen.human-gut.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *host" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *host ihmc_ethnicity ihmc_medication_code age body_product tissue body_mass_index chem_administration diet disease_stat family_relationship gastrointest_disord genotype height host_body_temp host_subject_id last_meal liver_disord medic_hist_perform misc_param occupation organism_count oxy_stat_samp perturbation phenotype pulse samp_size samp_salinity 
samp_store_dur samp_store_loc samp_store_temp sex special_diet temp tot_mass" << endl; + } + }else if (package == "human_oral") { + out << "#Environmental:MIMARKS.specimen.human-oral.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *host" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *host ihmc_ethnicity ihmc_medication_code age body_product tissue body_mass_index chem_administration diet disease_stat family_relationship genotype height host_body_temp host_subject_id last_meal medic_hist_perform misc_param nose_mouth_teeth_throat_disord occupation organism_count oxy_stat_samp perturbation phenotype pulse samp_size samp_salinity samp_store_dur samp_store_loc samp_store_temp sex temp time_last_toothbrush tot_mass" << endl; + } + }else if (package == "human_skin") { + out << "#Environmental:MIMARKS.specimen.human-skin.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *host" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *host ihmc_ethnicity ihmc_medication_code age body_product tissue body_mass_index chem_administration dermatology_disord diet disease_stat dominant_hand family_relationship genotype height host_body_temp host_subject_id last_meal medic_hist_perform misc_param occupation organism_count oxy_stat_samp perturbation phenotype pulse samp_size samp_salinity samp_store_dur samp_store_loc samp_store_temp sex temp time_since_last_wash tot_mass" << endl; + } + }else if (package == "human_vaginal") { 
+ out << "#Environmental:MIMARKS.specimen.human-vaginal.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *host" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *host hrt ihmc_ethnicity ihmc_medication_code age birth_control body_product tissue body_mass_index chem_administration diet disease_stat douche family_relationship genotype gynecologic_disord height host_body_temp host_subject_id hysterectomy last_meal medic_hist_perform menarche menopause misc_param occupation organism_count oxy_stat_samp perturbation phenotype pregnancy pulse samp_size samp_salinity samp_store_dur samp_store_loc samp_store_temp sex sexual_act temp tot_mass urogenit_disord" << endl; + } + }else if (package == "microbial") { + out << "#Environmental:MIMARKS.specimen.microbial.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *depth *elev" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *depth *elev alkalinity alkyl_diethers altitude aminopept_act ammonium bacteria_carb_prod biomass bishomohopanol bromide calcium carb_nitro_ratio chem_administration chloride chlorophyll diether_lipids diss_carb_dioxide diss_hydrogen diss_inorg_carb diss_org_carb diss_org_nitro diss_oxygen glucosidase_act magnesium mean_frict_vel mean_peak_frict_vel methane misc_param n_alkanes nitrate nitrite nitro org_carb org_matter org_nitro organism_count oxy_stat_samp ph part_org_carb perturbation petroleum_hydrocarb phaeopigments phosphate 
phosplipid_fatt_acid potassium pressure redox_potential salinity samp_size samp_store_dur samp_store_loc samp_store_temp silicate sodium sulfate sulfide temp tot_carb tot_nitro tot_org_carb turbidity water_content" << endl; + } + }else if (package == "miscellaneous") { + out << "#Environmental:MIMARKS.specimen.miscellaneous.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *title *seq_methods *lat_lon" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process alkalinity altitude ammonium biomass bromide calcium chem_administration chloride chlorophyll current density depth diether_lipids diss_carb_dioxide diss_hydrogen diss_inorg_carb diss_org_nitro diss_oxygen elev misc_param nitrate nitrite nitro org_carb org_matter org_nitro organism_count oxy_stat_samp ph perturbation phosphate phosplipid_fatt_acid potassium pressure salinity samp_size samp_store_dur samp_store_loc samp_store_temp silicate sodium sulfate sulfide temp" << endl; + } + }else if (package == "plant_associated") { + out << "#Environmental:MIMARKS.specimen.plant-associated.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *host" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *host age air_temp_regm altitude antibiotic_regm body_product chem_administration chem_mutagen climate_environment depth disease_stat dry_mass elev fertilizer_regm fungicide_regm gaseous_environment genotype gravity growth_hormone_regm growth_med height_or_length herbicide_regm host_taxid humidity_regm 
infra_specific_name infra_specific_rank life_stage mechanical_damage mineral_nutr_regm misc_param non_mineral_nutr_regm organism_count oxy_stat_samp ph_regm perturbation pesticide_regm phenotype tissue plant_product radiation_regm rainfall_regm salt_regm samp_size samp_salinity samp_store_dur samp_store_loc samp_store_temp season_environment standing_water_regm temp tiss_cult_growth_med tot_mass water_temp_regm watering_regm wet_mass" << endl; + } + }else if (package == "sediment") { + out << "#Environmental:MIMARKS.specimen.sediment.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *depth *elev" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *depth *elev alkalinity alkyl_diethers aminopept_act ammonium bacteria_carb_prod biomass bishomohopanol bromide calcium carb_nitro_ratio chem_administration chloride chlorophyll density diether_lipids diss_carb_dioxide diss_hydrogen diss_inorg_carb diss_org_carb diss_org_nitro diss_oxygen glucosidase_act magnesium mean_frict_vel mean_peak_frict_vel methane misc_param n_alkanes nitrate nitrite nitro org_carb org_matter org_nitro organism_count oxy_stat_samp ph particle_class part_org_carb perturbation petroleum_hydrocarb phaeopigments phosphate phosplipid_fatt_acid porosity potassium pressure redox_potential salinity samp_size samp_store_dur samp_store_loc samp_store_temp sediment_type silicate sodium sulfate sulfide temp tidal_stage tot_carb tot_nitro tot_org_carb turbidity water_content" << endl; + } + }else if (package == "soil") { + out << "#Environmental:MIMARKS.specimen.soil.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *depth *elev" 
<< endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *depth *elev altitude sieving cur_land_use cur_vegetation_meth cur_vegetation drainage_class al_sat al_sat_meth heavy_metals_meth heavy_metals salinity_meth extreme_salinity fao_class agrochem_addition crop_rotation extreme_event fire flooding previous_land_use_meth previous_land_use tillage horizon_meth horizon link_class_info link_climate_info link_addit_analys annual_season_precpt annual_season_temp microbial_biomass_meth microbial_biomass misc_param other ph_meth ph pool_dna_extracts profile_position samp_size samp_weight_dna_ext slope_aspect slope_gradient soil_type_meth soil_type local_class_meth local_class store_cond texture_meth texture tot_n_meth tot_n tot_org_c_meth tot_org_carb water_content_soil_meth water_content_soil" << endl; + } + }else if (package == "wastewater") { + out << "#Environmental:MIMARKS.specimen.wastewater.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process alkalinity biochem_oxygen_dem chem_administration chem_oxygen_dem depth efficiency_percent emulsions gaseous_substances indust_eff_percent inorg_particles misc_param nitrate org_particles organism_count oxy_stat_samp ph perturbation phosphate pre_treatment primary_treatment reactor_type samp_size samp_salinity samp_store_dur samp_store_loc samp_store_temp secondary_treatment sewage_type sludge_retent_time sodium soluble_inorg_mat soluble_org_mat suspend_solids temp tertiary_treatment tot_nitro tot_phosphate wastewater_type" << endl; 
+ } + }else if (package == "water") { + out << "#Environmental:MIMARKS.specimen.water.3.0" << endl; + if (requiredonly) { + out << "*sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods *depth" << endl; + }else { + out << "*sample_name description bioproject_id sample_title *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon *title *seq_methods rel_to_oxygen samp_collect_device samp_mat_process *depth alkalinity alkyl_diethers aminopept_act ammonium atmospheric_data bacteria_carb_prod biomass bishomohopanol bromide calcium carb_nitro_ratio chem_administration chloride chlorophyll current density diether_lipids diss_carb_dioxide diss_hydrogen diss_inorg_carb diss_inorg_nitro diss_inorg_phosp diss_org_carb diss_org_nitro diss_oxygen elev glucosidase_act light_intensity magnesium mean_frict_vel mean_peak_frict_vel misc_param n_alkanes nitrate nitrite nitro org_carb org_matter org_nitro organism_count oxy_stat_samp ph part_org_carb part_org_nitro perturbation petroleum_hydrocarb phaeopigments phosphate phosplipid_fatt_acid photon_flux potassium pressure primary_prod redox_potential salinity samp_size samp_store_dur samp_store_loc samp_store_temp silicate sodium soluble_react_phosp sulfate sulfide suspend_part_matter temp tidal_stage tot_depth_water_col tot_diss_nitro tot_inorg_nitro tot_nitro tot_part_carb tot_phosp" << endl; + } + } + + for (int i = 0; i < Groups.size(); i++) { out << Groups[i] << '\t' << endl; } + + out.close(); + //output files created by command m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); @@ -185,7 +326,296 @@ int GetMIMarksPackageCommand::execute(){ } catch(exception& e) { - m->errorOut(e, "GetMIMarksPackageCommand", "GetMIMarksPackageCommand"); + m->errorOut(e, "GetMIMarksPackageCommand", "execute"); + exit(1); + } +} +//*************************************************************************************************************** 
+int GetMIMarksPackageCommand::readOligos(){ + try { + ifstream inOligos; + m->openInputFile(oligosfile, inOligos); + + string type, oligo, roligo, group; + vector primerNameVector, barcodeNameVector; + set uniquePrimers; + set uniqueBarcodes; + + while(!inOligos.eof()){ + + inOligos >> type; + + if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); } + + if(type[0] == '#'){ + while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + m->gobble(inOligos); + } + else{ + m->gobble(inOligos); + //make type case insensitive + for(int i=0;i> oligo; + + if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); } + + for(int i=0;igobble(inOligos); + + inOligos >> roligo; + + for(int i=0;i> group; + + //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs + //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info + + string temp = ""; + while (!inOligos.eof()) { + char c = inOligos.get(); + if (c == 10 || c == 13 || c == -1){ break; } + else if (c == 32 || c == 9){;} //space or tab + else { temp += c; } + } + + //then this is illumina data with 4 columns + if (temp != "") { + + string reverseBarcode = group; //reverseOligo(group); //reverse barcode + group = temp; + + barcodeNameVector.push_back(group); + }else { + barcodeNameVector.push_back(group); + } + } + } + m->gobble(inOligos); + } + inOligos.close(); + + //add in potential combos + if(barcodeNameVector.size() == 0){ + barcodeNameVector.push_back(""); + } + + if(primerNameVector.size() == 0){ + primerNameVector.push_back(""); + } + + set uniqueNames; + for(int i = 0; i < barcodeNameVector.size(); i++){ + for(int j = 0; j < primerNameVector.size(); j++){ + + string primerName = primerNameVector[j]; + string barcodeName = barcodeNameVector[i]; + + if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing + else if ((primerName == "") && 
(barcodeName == "")) { } + else { + string comboGroupName = ""; + + if(primerName == ""){ + comboGroupName = barcodeNameVector[i]; + } + else{ + if(barcodeName == ""){ + comboGroupName = primerNameVector[j]; + } + else{ + comboGroupName = barcodeNameVector[i] + "." + primerNameVector[j]; + } + } + uniqueNames.insert(comboGroupName); + } + } + } + + + + if (m->debug) { int count = 0; for (set::iterator it = uniqueNames.begin(); it != uniqueNames.end(); it++) { m->mothurOut("[DEBUG]: " + toString(count) + " groupName = " + *it + "\n"); count++; } } + + for (set::iterator it = uniqueNames.begin(); it != uniqueNames.end(); it++) { Groups.push_back(*it); } + + return true; + + } + catch(exception& e) { + m->errorOut(e, "GetMIMarksPackageCommand", "readOligos"); + exit(1); + } +}
+//**********************************************************************************************************************
+// going to have to rework this to allow for other options --
+/*
+ file option 1
+
+ sfffile1   oligosfile1
+ sfffile2   oligosfile2
+ ...
+
+ file option 2
+
+ fastqfile1 oligosfile1
+ fastqfile2 oligosfile2
+ ...
+
+ file option 3 (the group name comes first - this is the order readFile parses and the order its error message describes)
+
+ group fastqfile fastqfile
+ group fastqfile fastqfile
+ group fastqfile fastqfile
+ ...
+
+ */
+
+//Reads the file named by the file parameter, one whitespace separated row at a time, and adds to Groups.
+//  2 column rows: column 1 is the sff or fastq file, column 2 is its oligos file. When both can be opened,
+//                 oligosfile is set to column 2 and readOligos() is called.
+//  3 column rows: column 1 is the group, columns 2 and 3 are the paired fastq files. When both can be opened,
+//                 the group is appended to Groups.
+//  Any other column count prints an error and sets m->control_pressed.
+//A file that cannot be opened as given is retried in the default path and then in the output directory.
+//A row whose files are still missing is reported with a warning and contributes nothing to Groups.
+//Returns 0 in every case, including when m->control_pressed stops the read early.
+int GetMIMarksPackageCommand::readFile(){
+    try {
+        inputfile = file;
+
+        ifstream in;
+        m->openInputFile(file, in);
+
+        while(!in.eof()) {
+
+            if (m->control_pressed) { return 0; }
+
+            string line = m->getline(in); m->gobble(in);
+            vector<string> pieces = m->splitWhiteSpace(line);
+
+            string group = "";
+            string thisFileName1 = "";
+            string thisFileName2 = "";
+            if (pieces.size() == 2) {
+                thisFileName1 = pieces[0];
+                thisFileName2 = pieces[1];
+            }else if (pieces.size() == 3) {
+                //assign the variable declared above. Redeclaring it here ("string group = ...") created a second,
+                //block-local group, so the push_back at the bottom of the loop always stored an empty name.
+                group = pieces[0];
+                thisFileName1 = pieces[1];
+                thisFileName2 = pieces[2];
+            }else {
+                m->mothurOut("[ERROR]: file lines can be 2 or 3 columns. The 2 column files are sff file then oligos or fastqfile then oligos. You may have multiple lines in the file. The 3 column files are for paired read libraries. The format is groupName, forwardFastqFile reverseFastqFile. \n"); m->control_pressed = true;
+                //a malformed row has no file names to look for; skip the open attempts so they do not print
+                //"Unable to open" / "can't find" messages for empty names. If rows remain, the control_pressed
+                //check at the top of the loop ends the read.
+                continue;
+            }
+
+            if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", thisFileName1 = " + thisFileName1 + ", thisFileName2 = " + thisFileName2 + ".\n"); }
+
+            //check to make sure both are able to be opened. openInputFile returning 1 means the open failed.
+            ifstream inForward;
+            int openForward = m->openInputFile(thisFileName1, inForward, "noerror");
+
+            //if you can't open it, try default location
+            if (openForward == 1) {
+                if (m->getDefaultPath() != "") { //default path is set
+                    string tryPath = m->getDefaultPath() + m->getSimpleName(thisFileName1);
+                    m->mothurOut("Unable to open " + thisFileName1 + ". Trying default " + tryPath); m->mothurOutEndLine();
+                    ifstream inForwardDefault;
+                    openForward = m->openInputFile(tryPath, inForwardDefault, "noerror");
+                    inForwardDefault.close();
+                    thisFileName1 = tryPath;
+                }
+            }
+
+            //if you can't open it, try output location
+            if (openForward == 1) {
+                if (m->getOutputDir() != "") { //output directory is set
+                    string tryPath = m->getOutputDir() + m->getSimpleName(thisFileName1);
+                    m->mothurOut("Unable to open " + thisFileName1 + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                    ifstream inForwardOutput;
+                    openForward = m->openInputFile(tryPath, inForwardOutput, "noerror");
+                    thisFileName1 = tryPath;
+                    inForwardOutput.close();
+                }
+            }
+
+            if (openForward == 1) { //can't find it
+                m->mothurOut("[WARNING]: can't find " + thisFileName1 + ", ignoring.\n");
+            }else{ inForward.close(); }
+
+            ifstream inReverse;
+            int openReverse = m->openInputFile(thisFileName2, inReverse, "noerror");
+
+            //if you can't open it, try default location
+            if (openReverse == 1) {
+                if (m->getDefaultPath() != "") { //default path is set
+                    string tryPath = m->getDefaultPath() + m->getSimpleName(thisFileName2);
+                    m->mothurOut("Unable to open " + thisFileName2 + ". Trying default " + tryPath); m->mothurOutEndLine();
+                    ifstream inReverseDefault; //own name: this stream used to shadow the outer one
+                    openReverse = m->openInputFile(tryPath, inReverseDefault, "noerror");
+                    inReverseDefault.close();
+                    thisFileName2 = tryPath;
+                }
+            }
+
+            //if you can't open it, try output location
+            if (openReverse == 1) {
+                if (m->getOutputDir() != "") { //output directory is set
+                    string tryPath = m->getOutputDir() + m->getSimpleName(thisFileName2);
+                    m->mothurOut("Unable to open " + thisFileName2 + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                    ifstream inReverseOutput;
+                    openReverse = m->openInputFile(tryPath, inReverseOutput, "noerror");
+                    thisFileName2 = tryPath;
+                    inReverseOutput.close();
+                }
+            }
+
+            if (openReverse == 1) { //can't find it
+                m->mothurOut("[WARNING]: can't find " + thisFileName2 + ", ignoring pair.\n");
+            }else{ inReverse.close(); }
+
+            //only rows with 2 or 3 columns reach this point, and only rows whose two files were both found are used
+            if ((openForward != 1) && (openReverse != 1)) {
+                if (pieces.size() == 2) { //good pair and sff or fastq and oligos
+                    oligosfile = thisFileName2;
+                    if (m->debug) { m->mothurOut("[DEBUG]: about to read oligos\n"); }
+                    readOligos();
+                }else { //good pair and paired read
+                    Groups.push_back(group);
+                }
+            }
+        }
+        in.close();
+
+        //NOTE(review): repeats the assignment made at the top; only needed if readOligos() changes inputfile - confirm
+        inputfile = file;
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "GetMIMarksPackageCommand", "readFile");
         exit(1);
     }
 }
diff --git a/getmimarkspackagecommand.h b/getmimarkspackagecommand.h index e45dfe7..ccef832 100644 --- a/getmimarkspackagecommand.h +++ b/getmimarkspackagecommand.h @@ -32,10 +32,13 @@ public: void help() { m->mothurOut(getHelpString()); } private: - bool abort; - string oligosfile, groupfile, package; + bool abort, requiredonly; + string oligosfile, groupfile, package, inputfile, file; string outputDir; - vector outputNames; + vector outputNames, Groups; + + int readOligos(); + int readFile(); }; /**************************************************************************************************/ diff --git a/mothurout.cpp b/mothurout.cpp index 313deef..baa7710 100644 --- a/mothurout.cpp +++
b/mothurout.cpp @@ -2823,7 +2823,7 @@ bool MothurOut::isSubset(vector bigset, vector subset) { if (subset.size() > bigset.size()) { return false; } - //check if each guy in suset is also in bigset + //check if each guy in subset is also in bigset for (int i = 0; i < subset.size(); i++) { bool match = false; for (int j = 0; j < bigset.size(); j++) { @@ -3599,6 +3599,28 @@ bool MothurOut::inUsersGroups(vector groupnames, vector Groups) exit(1); } }
+/**************************************************************************************************/
+//removes entries that are empty or contain only white space; the remaining entries keep their order.
+//tempVector is overwritten with the filtered list. Always returns 0.
+int MothurOut::removeBlanks(vector<string>& tempVector) {
+    try {
+        vector<string> newVector;
+        for (size_t i = 0; i < tempVector.size(); i++) {
+            bool isBlank = true;
+            for (size_t j = 0; j < tempVector[i].length(); j++) {
+                //cast first: passing isspace a negative char (a byte >= 0x80 where char is signed) is undefined behavior
+                if (!isspace(static_cast<unsigned char>(tempVector[i][j]))) { isBlank = false; break; } //contains non space chars, break out and save
+            }
+            if (!isBlank) { newVector.push_back(tempVector[i]); }
+        }
+        tempVector = newVector;
+        return 0;
+    }
+    catch(exception& e) {
+        errorOut(e, "MothurOut", "removeBlanks");
+        exit(1);
+    }
+}
/***********************************************************************/ //this function determines if the user has given us labels that are smaller than the given label. //if so then it returns true so that the calling function can run the previous valid distance.
diff --git a/mothurout.h b/mothurout.h index d5b7e5f..a57fb13 100644 --- a/mothurout.h +++ b/mothurout.h @@ -132,7 +132,7 @@ class MothurOut { //searchs and checks bool checkReleaseVersion(ifstream&, string); bool anyLabelsToProcess(string, set&, string); - bool inUsersGroups(vector, vector); + bool inUsersGroups(vector, vector); //returns true if any of the strings in first vector are in second vector bool inUsersGroups(vector, vector< vector >); bool inUsersGroups(string, vector); bool inUsersGroups(int, vector); @@ -158,6 +158,7 @@ class MothurOut { void splitAtDash(string&, vector&); void splitAtChar(string&, vector&, char); void splitAtChar(string&, string&, char); + int removeBlanks(vector&); vector splitWhiteSpaceWithQuotes(string); int removeConfidences(string&); string removeQuotes(string); diff --git a/sracommand.cpp b/sracommand.cpp index aa35365..02e9899 100644 --- a/sracommand.cpp +++ b/sracommand.cpp @@ -14,18 +14,19 @@ vector SRACommand::setParameters(){ try { CommandParameter psff("sff", "InputTypes", "", "", "sffFastQFile", "sffFastQFile", "none","xml",false,false); parameters.push_back(psff); - CommandParameter pgroup("group", "InputTypes", "", "", "groupOligos", "none", "none","",false,false); parameters.push_back(pgroup); - CommandParameter poligos("oligos", "InputTypes", "", "", "groupOligos", "none", "none","",false,false); parameters.push_back(poligos); - CommandParameter pfile("file", "InputTypes", "", "", "sffFastQFile", "sffFastQFile", "none","xml",false,false); parameters.push_back(pfile); + CommandParameter poligos("oligos", "InputTypes", "", "", "oligos", "none", "none","",false,false,true); parameters.push_back(poligos); + CommandParameter pfile("file", "InputTypes", "", "", "sffFastQFile-oligos", "sffFastQFile", "none","xml",false,false); parameters.push_back(pfile); CommandParameter pfastq("fastq", "InputTypes", "", "", "sffFastQFile", "sffFastQFile", "none","xml",false,false); parameters.push_back(pfastq); - CommandParameter 
pcontact("contact", "InputTypes", "", "", "none", "none", "none","xml",false,true,true); parameters.push_back(pcontact); + CommandParameter pcontact("project", "InputTypes", "", "", "none", "none", "none","xml",false,true,true); parameters.push_back(pcontact); + CommandParameter pmimark("mimark", "InputTypes", "", "", "none", "none", "none","xml",false,true,true); parameters.push_back(pmimark); //choose only one multiple options CommandParameter pplatform("platform", "Multiple", "_LS454-ILLUMINA-ION_TORRENT-PACBIO_SMRT", "_LS454", "", "", "","",false,false); parameters.push_back(pplatform); CommandParameter pinstrument("instrument", "Multiple", "454_GS-454_GS_20-454_GS_FLX-454_GS_FLX_Titanium-454_GS_Junior-Illumina_Genome_Analyzer-Illumina_Genome_Analyzer_II-Illumina_Genome_Analyzer_IIx-Illumina_HiSeq_2000-Illumina_HiSeq_1000-Illumina_MiSeq-PacBio_RS-Ion_Torrent_PGM-unspecified", "454_GS", "", "", "","",false,false); parameters.push_back(pinstrument); CommandParameter plibstrategy("libstrategy", "String", "AMPLICON", "", "", "", "","",false,false); parameters.push_back(plibstrategy); + CommandParameter pdatatype("datatype", "String", "METAGENOME", "", "", "", "","",false,false); parameters.push_back(pdatatype); CommandParameter plibsource("libsource", "String", "METAGENOMIC", "", "", "", "","",false,false); parameters.push_back(plibsource); CommandParameter plibselection("libselection", "String", "PCR", "", "", "", "","",false,false); parameters.push_back(plibselection); - + CommandParameter porientation("orientation", "Multiple", "forward-reverse", "forward", "", "", "","",false,false); parameters.push_back(porientation); CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ppdiffs); CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pbdiffs); CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs); @@ -50,26 
+51,26 @@ string SRACommand::getHelpString(){ try { string helpString = ""; helpString += "The sra command creates the necessary files for a NCBI submission. The xml file and individual sff or fastq files parsed from the original sff or fastq file.\n"; - helpString += "The sra command parameters are: sff, fastq, file, oligos, contact, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, group, platform, libstrategy, libsource, libselection and instrument.\n"; + helpString += "The sra command parameters are: sff, fastq, file, oligos, project, mimarksfile, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, platform, orientation, libstrategy, datatype, libsource, libselection and instrument.\n"; helpString += "The sff parameter is used to provide the original sff file.\n"; helpString += "The fastq parameter is used to provide the original fastq file.\n"; - helpString += "The contact parameter is used to provide your contact file.\n"; - helpString += "The oligos parameter is used to provide an oligos file to parse your sff or fastq file by.\n"; - helpString += "The group parameter is used to provide the group file to parse your sff or fastq file by.\n"; + helpString += "The project parameter is used to provide your project file.\n"; + helpString += "The oligos parameter is used to provide an oligos file to parse your sff or fastq file by. It is required and must contain barcodes and primers, or you must provide a file option. \n"; + helpString += "The mimark parameter is used to provide your mimarks file. You can create the template for this file using the get.mimarkspackage command.\n"; helpString += "The file parameter is used to provide a file containing a list of individual fastq or sff files or paired fastq files with a group assignment. File lines can be 2 or 3 columns. The 2 column files are sff file then oligos or fastqfile then oligos. You may have multiple lines in the file. The 3 column files are for paired read libraries. 
The format is groupName, forwardFastqFile reverseFastqFile.\n"; helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; - helpString += "The platform parameter is used to specify platfrom you are using choices are: _LS454,ILLUMINA,ION_TORRENT,PACBIO_SMRT. Default=_LS454. This is a controlled vocabulary section in the XML file that will be generated.\n"; + helpString += "The platform parameter is used to specify platform you are using choices are: _LS454,ILLUMINA,ION_TORRENT,PACBIO_SMRT. Default=_LS454. This is a controlled vocabulary section in the XML file that will be generated.\n"; + helpString += "The orientation parameter is used to specify sequence orientation. Choices are: forward and reverse. Default=forward. This is a controlled vocabulary section in the XML file that will be generated.\n"; helpString += "The instrument parameter is used to specify instrument. Choices are 454_GS-454_GS_20-454_GS_FLX-454_GS_FLX_Titanium-454_GS_Junior-Illumina_Genome_Analyzer-Illumina_Genome_Analyzer_II-Illumina_Genome_Analyzer_IIx-Illumina_HiSeq_2000-Illumina_HiSeq_1000-Illumina_MiSeq-PacBio_RS-Ion_Torrent_PGM-unspecified. Default=454_GS. This is a controlled vocabulary section in the XML file that will be generated. \n"; helpString += "The libstrategy parameter is used to specify library strategy. Default=AMPLICON. 
Choices are AMPLICON,WGA,WGS,WGX,RNA-Seq,miRNA-Seq,WCS,CLONE,POOLCLONE,CLONEEND,FINISHING,ChIP-Seq,MNase-Seq,DNase-Hypersensitivity,Bisulfite-Seq,Tn-Seq,EST,FL-cDNA,CTS,MRE-Seq,MeDIP-Seq,MBD-Seq,OTHER. This is a controlled vocabulary section in the XML file that will be generated. \n"; helpString += "The libsource parameter is used to specify library source. Default=METAGENOMIC. Choices are METAGENOMIC,GENOMIC,TRANSCRIPTOMIC,METATRANSCRIPTOMIC,SYNTHETIC,VIRAL_RNA,OTHER. This is a controlled vocabulary section in the XML file that will be generated. \n"; helpString += "The libselection parameter is used to specify library selection. Default=PCR. Choices are PCR,RANDOM,RANDOM_PCR,RT-PCR,HMPR,MF,CF-S,CF-H,CF-T,CF-M,MDA,MSLL,cDNA,ChIP,MNase,DNAse,Hybrid_Selection,Reduced_Representation,Restriction_Digest,5-methylcytidine_antibody,MBD2_protein_methyl-CpG_binding_domain,CAGE,RACE,size_fractionation,Padlock_probes_capture_method,other,unspecified. This is a controlled vocabulary section in the XML file that will be generated. \n"; - - helpString += "The sra should be in the following format: \n"; - helpString += "sra(...)\n"; + helpString += "The datatype parameter is used to specify datatype. Default=METAGENOME. Choices are METAGENOME,GENOME_SEQUENCING,METAGENOMIC_ASSEMBLY,ASSEMBLY,TRANSCRIPTOME,PROTEOMIC,MAP,CLONE_ENDS,TARGETED_LOCI,RANDOM_SURVEY,EXOME,VARIATION,EPIGENOMICS,PHENOTYPE,GENOTYPE,OTHER. This is a controlled vocabulary section in the XML file that will be generated. \n"; + helpString += "sra(sff=sff=GHL4YHV01.sff, GHL4YHV01.oligos, project=test.project, mimark=MIMarksData.txt)\n"; return helpString; } catch(exception& e) { @@ -162,28 +163,28 @@ SRACommand::SRACommand(string option) { if (path == "") { parameters["file"] = inputDir + it->second; } } - it = parameters.find("group"); + it = parameters.find("oligos"); //user has given a template file if(it != parameters.end()){ path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. 
else leave path alone. - if (path == "") { parameters["group"] = inputDir + it->second; } + if (path == "") { parameters["oligos"] = inputDir + it->second; } } - it = parameters.find("oligos"); + it = parameters.find("project"); //user has given a template file if(it != parameters.end()){ path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["oligos"] = inputDir + it->second; } + if (path == "") { parameters["project"] = inputDir + it->second; } } - it = parameters.find("contact"); + it = parameters.find("mimark"); //user has given a template file if(it != parameters.end()){ path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["contact"] = inputDir + it->second; } + if (path == "") { parameters["mimark"] = inputDir + it->second; } } } @@ -200,46 +201,31 @@ SRACommand::SRACommand(string option) { if (file == "not open") { file = ""; abort = true; } else if (file == "not found") { file = ""; } - groupfile = validParameter.validFile(parameters, "group", true); - if (groupfile == "not open") { groupfile = ""; abort = true; } - else if (groupfile == "not found") { groupfile = ""; } - else { m->setGroupFile(groupfile); } - oligosfile = validParameter.validFile(parameters, "oligos", true); - if (oligosfile == "not found") { oligosfile = ""; } - else if(oligosfile == "not open") { abort = true; } + if (oligosfile == "not found") { oligosfile = ""; } + else if(oligosfile == "not open") { abort = true; } else { m->setOligosFile(oligosfile); } - contactfile = validParameter.validFile(parameters, "contact", true); - if (contactfile == "not found") { contactfile = ""; m->mothurOut("[ERROR]: You must provide a contact file before you can use the sra command."); m->mothurOutEndLine(); abort = true; } + contactfile = validParameter.validFile(parameters, "project", true); + if (contactfile == "not found") { 
contactfile = ""; m->mothurOut("[ERROR]: You must provide a project file before you can use the sra command."); m->mothurOutEndLine(); abort = true; } else if(contactfile == "not open") { abort = true; } + mimarksfile = validParameter.validFile(parameters, "mimark", true); + if (mimarksfile == "not found") { mimarksfile = ""; m->mothurOut("[ERROR]: You must provide a mimark file before you can use the sra command. You can create a template for this file using the get.mimarkspackage command."); m->mothurOutEndLine(); abort = true; } + else if(mimarksfile == "not open") { abort = true; } + file = validParameter.validFile(parameters, "file", true); if (file == "not open") { file = ""; abort = true; } else if (file == "not found") { file = ""; } - if ((fastqfile == "") && (sfffile == "") && (sfffile == "")) { - m->mothurOut("[ERROR]: You must provide a file, sff file or fastq file before you can use the sra command."); m->mothurOutEndLine(); abort = true; + if ((file == "") && (oligosfile == "")) { + m->mothurOut("[ERROR]: You must provide an oligos file or file with oligos files in them before you can use the sra command."); m->mothurOutEndLine(); abort = true; } - if ((groupfile != "") && (oligosfile != "")) { - m->mothurOut("[ERROR]: You may not use a group file and an oligos file, only one."); m->mothurOutEndLine(); abort = true; + if ((fastqfile == "") && (file == "") && (sfffile == "")) { + m->mothurOut("[ERROR]: You must provide a file, sff file or fastq file before you can use the sra command."); m->mothurOutEndLine(); abort = true; } - if ((fastqfile != "") || (sfffile != "")) { - if ((groupfile == "") && (oligosfile == "")) { - oligosfile = m->getOligosFile(); - if (oligosfile != "") { m->mothurOut("Using " + oligosfile + " as input file for the oligos parameter."); m->mothurOutEndLine(); } - else { - groupfile = m->getGroupFile(); - if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } 
- else { - m->mothurOut("[ERROR]: You must provide groupfile or oligos file if splitting a fastq or sff file."); m->mothurOutEndLine(); abort = true; - } - } - } - } - //use only one Mutliple type _LS454-ILLUMINA-ION_TORRENT-PACBIO_SMRT platform = validParameter.validFile(parameters, "platform", false); if (platform == "not found") { platform = "_LS454"; } if (!checkCasesPlatforms(platform)) { abort = true; } //error message in function @@ -269,6 +255,17 @@ SRACommand::SRACommand(string option) { //turn _ to spaces mothur's work around for (int i = 0; i < libSelection.length(); i++) { if (libSelection[i] == '_') { libSelection[i] = ' '; } } + + dataType = validParameter.validFile(parameters, "datatype", false); if (dataType == "not found") { dataType = "METAGENOME"; } + if (!checkCasesDataType(dataType)) { abort = true; } //error message in function + + //turn _ to spaces mothur's work around + for (int i = 0; i < dataType.length(); i++) { if (dataType[i] == '_') { dataType[i] = ' '; } } + + orientation = validParameter.validFile(parameters, "orientation", false); if (orientation == "not found") { orientation = "forward"; } + + if ((orientation == "forward") || (orientation == "reverse")) { } + else { m->mothurOut("[ERROR]: " + orientation + " is not a valid orientation option. 
Choices are: forward and reverse.\n"); m->mothurOutEndLine(); abort = true; } string temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found"){ temp = "0"; } @@ -303,8 +300,8 @@ int SRACommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } readContactFile(); - if (oligosfile != "") { readOligos(); Groups.push_back("scrap"); } - if (groupfile != "") { GroupMap groupmap(groupfile); groupmap.readMap(); Groups = groupmap.getNamesOfGroups(); Groups.push_back("scrap"); } + readMIMarksFile(); + if (oligosfile != "") { readOligos(); Groups.push_back("scrap"); } if (m->control_pressed) { return 0; } @@ -316,6 +313,8 @@ int SRACommand::execute(){ else if (sfffile != "") { parseSffFile(filesBySample); } else if (fastqfile != "") { parseFastqFile(filesBySample); } + sanityCheckMiMarksGroups(); + //checks groups and files returned from parse - removes any groups that did not get reads assigned to them, orders files. checkGroups(filesBySample); @@ -355,25 +354,21 @@ int SRACommand::execute(){ out << "\t\t\t\t\n"; out << "\t\t\t\t\t\n"; out << "\t\t\t\t\t\t\n"; - ///////////////////////out << "\t\t\t\t\t\t" + ProjectID + " \n"; + out << "\t\t\t\t\t\t" + projectName + " \n"; out << "\t\t\t\t\t\t\n"; out << "\t\t\t\t\t\t\n"; - ////////////////////out << "\t\t\t\t\t\t\t" + title + " \n"; + out << "\t\t\t\t\t\t\t" + projectTitle + " \n"; out << "\t\t\t\t\t\t\t

" + description + "

\n"; - out << "\t\t\t\t\t\t\t\n"; - /////////////////////////out << "\t\t\t\t\t\t\t\t" + website + "\n"; - out << "\t\t\t\t\t\t\t\n"; - out << "\t\t\t\t\t\t\t\n"; - //////////////////////out << "\t\t\t\t\t\t\t\t" + medicalRelevance + "\n"; - out << "\t\t\t\t\t\t\t\n"; + if (website != "") { + out << "\t\t\t\t\t\t\t\n"; + out << "\t\t\t\t\t\t\t\t" + website + "\n"; + out << "\t\t\t\t\t\t\t\n"; + } out << "\t\t\t\t\t\t
\n"; out << "\t\t\t\t\t\t\n"; - /////////////////////////out << "\t\t\t\t\t\t\t\n"; // - out << "\t\t\t\t\t\t\t\t\n"; - ////////////////////out << "\t\t\t\t\t\t\t\t\t" + scientificName + " \n"; - out << "\t\t\t\t\t\t\t\t\n"; + out << "\t\t\t\t\t\t\t\n"; out << "\t\t\t\t\t\t\t\t\n"; - ////////////////////out << "\t\t\t\t\t\t\t\t\t" + dataType + " \n"; + out << "\t\t\t\t\t\t\t\t\t" + dataType + " \n"; out << "\t\t\t\t\t\t\t\t\n"; out << "\t\t\t\t\t\t\t\n"; out << "\t\t\t\t\t\t\n"; @@ -381,7 +376,7 @@ int SRACommand::execute(){ out << "\t\t\t\t
\n"; out << "\t\t\t\n"; out << "\t\t\t\n"; - ////////////////////////////out << "\t\t\t\t" + ProjectID + " \n"; + out << "\t\t\t\t\t\t" + projectName + " \n"; out << "\t\t\t\n"; out << "\t\t\n"; out << "\t\n"; @@ -391,102 +386,140 @@ int SRACommand::execute(){ //////////////////////////////////////////////////////// for (int i = 0; i < Groups.size(); i++) { - vector thisGroupsFiles = filesBySample[Groups[i]]; string barcodeForThisSample = Group2Barcode[Groups[i]]; - for (int j = 0; j < thisGroupsFiles.size(); j++) { - if (m->control_pressed) { break; } - out << "\t\n"; - out << "\t\t\n"; - out << "\t\t\t\n"; - out << "\t\t\t\t\n"; - out << "\t\t\t\t\t\n"; - out << "\t\t\t\t\t\t\n"; - out << "\t\t\t\t\t\t" + Groups[i] + " \n"; - out << "\t\t\t\t\t\t\n"; - out << "\t\t\t\t\t\t\n"; - ////////////////////out << "\t\t\t\t\t\t\t" + title + " \n"; - out << "\t\t\t\t\t\t\n"; - out << "\t\t\t\t\t\t\n"; - ////////////////////out << "\t\t\t\t\t\t\t" + scientificName + " \n"; - out << "\t\t\t\t\t\t\n"; - out << "\t\t\t\t\t\t\n"; - ///////////////////////out << "\t\t\t\t\t\t\t" + BioProject + " \n"; - out << "\t\t\t\t\t\t\n"; - out << "\t\t\t\t\t\tMIMARKS.specimenn"; - out << "\t\t\t\t\t\tn"; - //add biosample required attributes - /////////////////////////////////////////////////////////////////////// - - out << "\t\t\t\t\t\tn"; - out << "\t\t\t\t\t\n"; - out << "\t\t\t\t\n"; - out << "\t\t\t\n"; - - //libID - out << "\t\t\t\n"; - string libId = thisGroupsFiles[j] + barcodeForThisSample; - if (libLayout == "paired") { //adjust the libID because the thisGroupsFiles[j] contains two filenames - vector pieces = m->splitWhiteSpace(thisGroupsFiles[j]); - libId = pieces[0] + barcodeForThisSample; + if (m->control_pressed) { break; } + out << "\t\n"; + out << "\t\t\n"; + out << "\t\t\t\n"; + out << "\t\t\t\t\n"; + out << "\t\t\t\t\t\n"; + out << "\t\t\t\t\t\t\n"; + out << "\t\t\t\t\t\t" + Groups[i] + " \n"; + out << "\t\t\t\t\t\t\n"; + out << "\t\t\t\t\t\t\n"; + string organismName = 
"metagenome"; + map::iterator itOrganism = Group2Organism.find(Groups[i]); + if (itOrganism != Group2Organism.end()) { organismName = itOrganism->second; } //user supplied acceptable organism, so use it. + out << "\t\t\t\t\t\t\t" + organismName + " \n"; + out << "\t\t\t\t\t\t\n"; + out << "\t\t\t\t\t\t" + packageType + "n"; + out << "\t\t\t\t\t\tn"; + //add biosample required attributes + map >:: iterator it = mimarks.find(Groups[i]); + if (it != mimarks.end()) { + map categories = it->second; + for (map:: iterator it2 = categories.begin(); it2 != categories.end(); it2++) { + if (m->control_pressed) { break; } + out << "\t\t\t\t\t\t\tfirst + "\">\"" + it2->second + "\"\n"; } - out << "\t\t\t\t" + libId + " \n"; - out << "\t\t\t\n"; - - out << "\t\t\n"; - out << "\t\n"; } + out << "\t\t\t\t\t\tn"; + out << "\t\t\t\t\t\n"; + out << "\t\t\t\t\n"; + out << "\t\t\t\n"; + out << "\t\t\t\n"; + out << "\t\t\t\t" + Groups[i] + " \n"; + out << "\t\t\t\n"; + out << "\t\t\n"; + out << "\t\n"; } + //File objects + //////////////////////////////////////////////////////// for (int i = 0; i < Groups.size(); i++) { vector thisGroupsFiles = filesBySample[Groups[i]]; string barcodeForThisSample = Group2Barcode[Groups[i]]; for (int j = 0; j < thisGroupsFiles.size(); j++) { - if (m->control_pressed) { break; } + string libId = thisGroupsFiles[j] + "." + barcodeForThisSample; + + if (m->control_pressed) { break; } out << "\t\n"; out << "\t\t\n"; if (libLayout == "paired") { //adjust the libID because the thisGroupsFiles[j] contains two filenames vector pieces = m->splitWhiteSpace(thisGroupsFiles[j]); + libId = pieces[0] + barcodeForThisSample; out << "\t\t\t\n"; - ////////////////////out << "\t\t\t\tfastq \n"; //since its paired we know its fastq, is the dataType the fileType??? 
+ out << "\t\t\t\tgeneric-data \n"; out << "\t\t\t\n"; + vector thisBarcodes; m->splitAtChar(Group2Barcode[Groups[i]], thisBarcodes, '.'); + string forwardBarcode = thisBarcodes[0]; + string reverseBarcode = thisBarcodes[1]; + vector thisPrimers; m->splitAtChar(Group2Primer[Groups[i]], thisPrimers, '.'); + string forwardPrimer = thisPrimers[0]; + string reversePrimer = thisPrimers[1]; + //attributes + out << "\t\t\t" + mimarks[Groups[i]]["title"] + "\n"; + out << "\t\t\t" + forwardBarcode + "\n"; + out << "\t\t\t" + forwardPrimer + "\n"; + out << "\t\t\tforward\n"; + out << "\t\t\t" + libId + "\n"; + out << "\t\t\t" + libStrategy + "\n"; + out << "\t\t\t" + libSource + "\n"; + out << "\t\t\t" + libSelection + "\n"; + out << "\t\t\t" + libLayout + "\n"; + out << "\t\t\t" + instrumentModel + "\n"; + out << "\t\t\t" + mimarks[Groups[i]]["seq_methods"] + "\n"; + out << "\t\t\t\n"; - ////////////////////out << "\t\t\t\tfastq \n"; //since its paired we know its fastq, is the dataType the fileType??? + out << "\t\t\t\tgeneric-data \n"; out << "\t\t\t\n"; + out << "\t\t\t" + mimarks[Groups[i]]["title"] + "\n"; + out << "\t\t\t" + reverseBarcode + "\n"; + out << "\t\t\t" + reversePrimer + "\n"; + out << "\t\t\treverse\n"; + out << "\t\t\t" + libId + "\n"; + out << "\t\t\t" + libStrategy + "\n"; + out << "\t\t\t" + libSource + "\n"; + out << "\t\t\t" + libSelection + "\n"; + out << "\t\t\t" + libLayout + "\n"; + out << "\t\t\t" + instrumentModel + "\n"; + out << "\t\t\t" + mimarks[Groups[i]]["seq_methods"] + "\n"; + }else { //single out << "\t\t\t\n"; - string dataType = "fastq"; - if (isSFF) { dataType = "sff"; } - ////////////////////out << "\t\t\t\t" + dataType + " \n"; //is the dataType the fileType??? 
+ out << "\t\t\t\tgeneric-data \n"; out << "\t\t\t\n"; + //attributes + out << "\t\t\t" + mimarks[Groups[i]]["title"] + "\n"; + out << "\t\t\t" + Group2Barcode[Groups[i]] + "\n"; + out << "\t\t\t" + Group2Primer[Groups[i]] + "\n"; + out << "\t\t\t" + orientation + "\n"; + out << "\t\t\t" + libId + "\n"; + out << "\t\t\t" + libStrategy + "\n"; + out << "\t\t\t" + libSource + "\n"; + out << "\t\t\t" + libSelection + "\n"; + out << "\t\t\t" + libLayout + "\n"; + out << "\t\t\t" + instrumentModel + "\n"; + out << "\t\t\t" + mimarks[Groups[i]]["seq_methods"] + "\n"; + } - //attributes - out << "\t\t\t" + instrumentModel + "\n"; - out << "\t\t\t" + libStrategy + "\n"; - out << "\t\t\t" + libSource + "\n"; - out << "\t\t\t" + libSelection + "\n"; - out << "\t\t\t" + libLayout + "\n"; - - //////////////////bioSample info ///////////////////bioProject info - + out << "\t\t\t\n"; + out << "\t\t\t\t\n"; + out << "\t\t\t\t\t" + projectName + " \n"; + out << "\t\t\t\t\n"; + out << "\t\t\t\n"; + //////////////////bioSample info + out << "\t\t\t\n"; + out << "\t\t\t\t\n"; + out << "\t\t\t\t\t" + Groups[i] + " \n"; + out << "\t\t\t\t\n"; + out << "\t\t\t\n"; //libID out << "\t\t\t\n"; - string libId = thisGroupsFiles[j] + barcodeForThisSample; if (libLayout == "paired") { //adjust the libID because the thisGroupsFiles[j] contains two filenames vector pieces = m->splitWhiteSpace(thisGroupsFiles[j]); libId = pieces[0] + barcodeForThisSample; } - out << "\t\t\t\t" + libId + " \n"; + out << "\t\t\t\t" + libId + " \n"; out << "\t\t\t\n"; out << "\t\t\n"; out << "\t\n"; } } - - //////////////////////////////////////////////////////// out << "\n"; out.close(); @@ -508,7 +541,8 @@ int SRACommand::execute(){ //********************************************************************************************************************** int SRACommand::readContactFile(){ try { - lastName = ""; firstName = ""; submissionName = ""; email = ""; centerName = ""; centerType = ""; description = ""; + 
lastName = ""; firstName = ""; submissionName = ""; email = ""; centerName = ""; centerType = ""; description = ""; website = ""; projectName = ""; + projectTitle = ""; grantAgency = ""; grantId = ""; grantTitle = ""; ifstream in; m->openInputFile(contactfile, in); @@ -523,28 +557,36 @@ int SRACommand::readContactFile(){ for (int i = 0; i < key.length(); i++) { key[i] = toupper(key[i]); } - if (key == "USERNAME") { submissionName = value; } - else if (key == "LAST") { lastName = value; } - else if (key == "FIRST") { firstName = value; } - else if (key == "EMAIL") { email = value; } - else if (key == "CENTER") { centerName = value; } - else if (key == "TYPE") { + if (key == "USERNAME") { submissionName = value; } + else if (key == "LAST") { lastName = value; } + else if (key == "FIRST") { firstName = value; } + else if (key == "EMAIL") { email = value; } + else if (key == "CENTER") { centerName = value; } + else if (key == "TYPE") { centerType = value; for (int i = 0; i < centerType.length(); i++) { centerType[i] = tolower(centerType[i]); } if ((centerType == "consortium") || (centerType == "center") || (centerType == "institute") || (centerType == "lab")) {} else { m->mothurOut("[ERROR]: " + centerType + " is not a center type option. Valid center type options are consortium, center, institute and lab. 
This is a controlled vocabulary section in the XML file that will be generated."); m->mothurOutEndLine(); m->control_pressed = true; } }else if (key == "DESCRIPTION") { description = value; } + else if (key == "WEBSITE") { website = value; } + else if (key == "PROJECTNAME") { projectName = value; } + else if (key == "PROJECTTITLE") { projectTitle = value; } + else if (key == "GRANTID") { grantId = value; } + else if (key == "GRANTTITLE") { grantTitle = value; } + else if (key == "GRANTAGENCY") { grantAgency = value; } } in.close(); - if (lastName == "") { m->mothurOut("[ERROR]: missing last name from contacts file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } - if (firstName == "") { m->mothurOut("[ERROR]: missing first name from contacts file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } - if (submissionName == "") { m->mothurOut("[ERROR]: missing submission name from contacts file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } - if (email == "") { m->mothurOut("[ERROR]: missing email from contacts file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } - if (centerName == "") { m->mothurOut("[ERROR]: missing center name from contacts file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } - if (centerType == "") { m->mothurOut("[ERROR]: missing center type from contacts file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } - if (description == "") { m->mothurOut("[ERROR]: missing description from contacts file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } - + if (lastName == "") { m->mothurOut("[ERROR]: missing last name from project file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } + if (firstName == "") { m->mothurOut("[ERROR]: missing first name from project file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } + if (submissionName == "") { m->mothurOut("[ERROR]: missing submission name from project 
file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } + if (email == "") { m->mothurOut("[ERROR]: missing email from project file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } + if (centerName == "") { m->mothurOut("[ERROR]: missing center name from project file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } + if (centerType == "") { m->mothurOut("[ERROR]: missing center type from project file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } + if (description == "") { m->mothurOut("[ERROR]: missing description from project file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } + if (projectTitle == "") { m->mothurOut("[ERROR]: missing project title from project file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } + if (projectName == "") { m->mothurOut("[ERROR]: missing project name from project file, quitting."); m->mothurOutEndLine(); m->control_pressed = true; } + return 0; } catch(exception& e) { @@ -552,6 +594,169 @@ int SRACommand::readContactFile(){ exit(1); } } +//********************************************************************************************************************** +//air, host_associated, human_associated, human_gut, human_oral, human_skin, human_vaginal, microbial, miscellaneous, plant_associated, sediment, soil, wastewater or water +//all packages require: *sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon +//air: *altitude +//host_associated, human_associated, human_gut, human_oral, human_skin, human_vaginal, plant_associated: *host +//microbial, sediment, soil: *depth *elev +//water: *depth +int SRACommand::readMIMarksFile(){ + try { + //acceptable organisms + vector acceptableOrganisms; + bool organismError = false; + //ecological + acceptableOrganisms.push_back("activated carbon metagenome"); acceptableOrganisms.push_back("activated sludge metagenome"); acceptableOrganisms.push_back("air 
metagenome"); acceptableOrganisms.push_back("anaerobic digester metagenome"); acceptableOrganisms.push_back("ant fungus garden metagenome"); acceptableOrganisms.push_back("aquatic metagenome"); acceptableOrganisms.push_back("activated carbon metagenome"); acceptableOrganisms.push_back("activated sludge metagenome"); acceptableOrganisms.push_back("beach sand metagenome"); acceptableOrganisms.push_back("biofilm metagenome"); acceptableOrganisms.push_back("biofilter metagenome"); acceptableOrganisms.push_back("biogas fermenter metagenome"); acceptableOrganisms.push_back("bioreactor metagenome"); acceptableOrganisms.push_back("bioreactor sludge metagenome"); acceptableOrganisms.push_back("clinical metagenome"); acceptableOrganisms.push_back("coal metagenome"); acceptableOrganisms.push_back("compost metagenome"); acceptableOrganisms.push_back("dust metagenome"); acceptableOrganisms.push_back("fermentation metagenome"); acceptableOrganisms.push_back("food fermentation metagenome"); acceptableOrganisms.push_back("food metagenome"); acceptableOrganisms.push_back("freshwater metagenome"); acceptableOrganisms.push_back("freshwater sediment metagenome"); acceptableOrganisms.push_back("groundwater metagenome"); acceptableOrganisms.push_back("halite metagenome"); acceptableOrganisms.push_back("hot springs metagenome"); acceptableOrganisms.push_back("hydrocarbon metagenome"); acceptableOrganisms.push_back("hydrothermal vent metagenome"); acceptableOrganisms.push_back("hypersaline lake metagenome"); acceptableOrganisms.push_back("ice metagenome"); acceptableOrganisms.push_back("indoor metagenome"); acceptableOrganisms.push_back("industrial waste metagenome"); acceptableOrganisms.push_back("mangrove metagenome"); acceptableOrganisms.push_back("marine metagenome"); acceptableOrganisms.push_back("marine sediment metagenome"); acceptableOrganisms.push_back("microbial mat metagenome"); acceptableOrganisms.push_back("mine drainage metagenome"); acceptableOrganisms.push_back("mixed 
culture metagenome"); acceptableOrganisms.push_back("oil production facility metagenome"); acceptableOrganisms.push_back("paper pulp metagenome"); acceptableOrganisms.push_back("permafrost metagenome"); acceptableOrganisms.push_back("plastisphere metagenome"); acceptableOrganisms.push_back("power plant metagenome"); acceptableOrganisms.push_back("retting rhizosphere metagenome"); acceptableOrganisms.push_back("rock metagenome"); acceptableOrganisms.push_back("salt lake metagenome"); acceptableOrganisms.push_back("saltern metagenome"); acceptableOrganisms.push_back("sediment metagenome"); acceptableOrganisms.push_back("snow metagenome"); acceptableOrganisms.push_back("soil metagenome"); acceptableOrganisms.push_back("stromatolite metagenome"); acceptableOrganisms.push_back("terrestrial metagenome"); acceptableOrganisms.push_back("tomb wall metagenome"); acceptableOrganisms.push_back("wastewater metagenome"); acceptableOrganisms.push_back("wetland metagenome"); acceptableOrganisms.push_back("whale fall metagenome"); + //oganismal + acceptableOrganisms.push_back("algae metagenome"); acceptableOrganisms.push_back("ant metagenome"); acceptableOrganisms.push_back("bat metagenome"); acceptableOrganisms.push_back("beetle metagenome"); acceptableOrganisms.push_back("bovine gut metagenome"); acceptableOrganisms.push_back("bovine metagenome"); acceptableOrganisms.push_back("chicken gut metagenome"); acceptableOrganisms.push_back("coral metagenome"); acceptableOrganisms.push_back("echinoderm metagenome"); acceptableOrganisms.push_back("endophyte metagenome"); acceptableOrganisms.push_back("epibiont metagenome"); acceptableOrganisms.push_back("fish metagenome"); acceptableOrganisms.push_back("fossil metagenome"); acceptableOrganisms.push_back("gill metagenome"); acceptableOrganisms.push_back("gut metagenome"); acceptableOrganisms.push_back("honeybee metagenome"); acceptableOrganisms.push_back("human gut metagenome"); acceptableOrganisms.push_back("human lung metagenome"); 
acceptableOrganisms.push_back("human metagenome"); acceptableOrganisms.push_back("human nasal/pharyngeal metagenome"); acceptableOrganisms.push_back("human oral metagenome"); acceptableOrganisms.push_back("human skin metagenome"); acceptableOrganisms.push_back("insect gut metagenome"); acceptableOrganisms.push_back("insect metagenome"); acceptableOrganisms.push_back("mollusc metagenome"); acceptableOrganisms.push_back("mosquito metagenome"); acceptableOrganisms.push_back("mouse gut metagenome"); acceptableOrganisms.push_back("mouse metagenome"); acceptableOrganisms.push_back("mouse skin metagenome"); acceptableOrganisms.push_back("nematode metagenome"); acceptableOrganisms.push_back("oral metagenome"); acceptableOrganisms.push_back("phyllosphere metagenome"); acceptableOrganisms.push_back("pig metagenome"); acceptableOrganisms.push_back("plant metagenome"); acceptableOrganisms.push_back("primate metagenome"); acceptableOrganisms.push_back("rat metagenome"); acceptableOrganisms.push_back("root metagenome"); acceptableOrganisms.push_back("sea squirt metagenome"); acceptableOrganisms.push_back("seed metagenome"); acceptableOrganisms.push_back("shoot metagenome"); acceptableOrganisms.push_back("skin metagenome"); acceptableOrganisms.push_back("snake metagenome"); acceptableOrganisms.push_back("sponge metagenome"); acceptableOrganisms.push_back("stomach metagenome"); acceptableOrganisms.push_back("symbiont metagenome"); acceptableOrganisms.push_back("termite gut metagenome"); acceptableOrganisms.push_back("termite metagenome"); acceptableOrganisms.push_back("upper respiratory tract metagenome"); acceptableOrganisms.push_back("urine metagenome"); acceptableOrganisms.push_back("viral metagenome"); acceptableOrganisms.push_back("wallaby gut metagenome"); acceptableOrganisms.push_back("wasp metagenome"); acceptableOrganisms.push_back("sythetic metagenome"); acceptableOrganisms.push_back("metagenome"); + + vector requiredFieldsForPackage; + 
requiredFieldsForPackage.push_back("sample_name"); requiredFieldsForPackage.push_back("organism"); + requiredFieldsForPackage.push_back("collection_date"); requiredFieldsForPackage.push_back("biome"); + requiredFieldsForPackage.push_back("feature"); requiredFieldsForPackage.push_back("material"); + requiredFieldsForPackage.push_back("geo_loc_name"); requiredFieldsForPackage.push_back("lat_lon"); + requiredFieldsForPackage.push_back("seq_methods"); requiredFieldsForPackage.push_back("title"); + vector chooseAtLeastOneForPackage; + + ifstream in; + m->openInputFile(mimarksfile, in); + + //read comments + string temp; packageType = ""; + while(!in.eof()) { + + if (m->control_pressed) { break; } + temp = m->getline(in); m->gobble(in); + + if (m->debug) { m->mothurOut("[DEBUG]: " + temp + "\n"); } + + if (temp[0] == '#') { + int pos = temp.find("Environmental"); + if (pos != string::npos) { + for (int i = pos+14; i < temp.length(); i++) { + if (!isspace(temp[i])) { packageType += temp[i]; } + else { i+= temp.length(); } + } + } + } + else{ break; } //hit headers line + } + + vector headers; m->splitAtChar(temp, headers, '\t'); + m->removeBlanks(headers); + //remove * from required's + for (int i = 0; i < headers.size(); i++) { + if (headers[i][0] == '*') { headers[i] = headers[i].substr(1); } + if (headers[i][0] == '*') { headers[i] = headers[i].substr(1); chooseAtLeastOneForPackage.push_back(headers[i]); } //secondary condition + if (m->debug) { m->mothurOut("[DEBUG]: " + headers[i] + "\n"); } + } + + if (m->debug) { m->mothurOut("[DEBUG]: packageType = '" + packageType + "'\n"); } + + //check to make sure package has all its required parts + //MIMARKS.specimen.water.3.0 + if (packageType == "MIMARKS.specimen.air.3.0") { requiredFieldsForPackage.push_back("altitude"); } + else if ((packageType == "MIMARKS.specimen.host-associated.3.0") || (packageType == "MIMARKS.specimen.human-associated.3.0") || (packageType == "MIMARKS.specimen.human-gut.3.0") || (packageType == 
"MIMARKS.specimen.human-oral.3.0") || (packageType == "MIMARKS.specimen.human-skin.3.0") || (packageType == "MIMARKS.specimen.human-vaginal.3.0") || (packageType == "MIMARKS.specimen.plant-associated.3.0")) { requiredFieldsForPackage.push_back("host"); } + else if ((packageType == "MIMARKS.specimen.microbial.3.0") || (packageType == "MIMARKS.specimen.sediment.3.0") || (packageType == "soil")) { requiredFieldsForPackage.push_back("depth"); requiredFieldsForPackage.push_back("elev"); } + else if (packageType == "MIMARKS.specimen.water.3.0") { requiredFieldsForPackage.push_back("depth"); } + else if ((packageType == "MIMARKS.specimen.miscellaneous.3.0") || (packageType == "wastewater")) { } + else { + m->mothurOut("[ERROR]: unknown package " + packageType + ", please correct.\n"); m->control_pressed = true; in.close(); return 0; + } + + if (!m->isSubset(headers, requiredFieldsForPackage)){ + string requiredFields = ""; + for (int i = 0; i < requiredFieldsForPackage.size()-1; i++) { requiredFields += requiredFieldsForPackage[i] + ", "; } requiredFields += requiredFieldsForPackage[requiredFieldsForPackage.size()-1]; + m->mothurOut("[ERROR]: missing required fields for package, please correct. 
Required fields are " + requiredFields + ".\n"); m->control_pressed = true; in.close(); return 0; + } + + if (m->debug) { m->mothurOut("[DEBUG]: chooseAtLeastOneForPackage.size() = " + toString(chooseAtLeastOneForPackage.size()) + "\n"); } + + if (!m->inUsersGroups(chooseAtLeastOneForPackage, headers)){ //returns true if any of the choose at least ones are in headers + string requiredFields = ""; + for (int i = 0; i < chooseAtLeastOneForPackage.size()-1; i++) { requiredFields += chooseAtLeastOneForPackage[i] + ", "; cout << chooseAtLeastOneForPackage[i] << endl; } + if (chooseAtLeastOneForPackage.size() < 1) { requiredFields += chooseAtLeastOneForPackage[chooseAtLeastOneForPackage.size()-1]; } + m->mothurOut("[ERROR]: missing a choose at least one fields for the package, please correct. These are marked with '**'. Required fields are " + requiredFields + ".\n"); m->control_pressed = true; in.close(); return 0; + } + + map allNA; for (int i = 1; i < headers.size(); i++) { allNA[headers[i]] = true; } + while(!in.eof()) { + + if (m->control_pressed) { break; } + + temp = m->getline(in); m->gobble(in); + + if (m->debug) { m->mothurOut("[DEBUG]: " + temp + "\n"); } + + string original = temp; + vector linePieces; m->splitAtChar(temp, linePieces, '\t'); + m->removeBlanks(linePieces); + + if (linePieces.size() != headers.size()) { m->mothurOut("[ERROR]: line: " + original + " contains " + toString(linePieces.size()) + " columns, but you have " + toString(headers.size()) + " column headers, please correct.\n"); m->control_pressed = true; } + else { + map >:: iterator it = mimarks.find(linePieces[0]); + + if (it == mimarks.end()) { + map categories; + //start after *sample_name + for (int i = 1; i < headers.size(); i++) { + categories[headers[i]] = linePieces[i]; + //check the users inputs for appropriate organisms + if (headers[i] == "organism") { + if (!m->inUsersGroups(linePieces[i], acceptableOrganisms)) { //not an acceptable organism + organismError = true; + 
m->mothurOut("[WARNING]: " + linePieces[i]+ " is not an acceptable organism, changing to metagenome. You can correct the issue and rerun the command, or NCBI will allow you to modify the organism after submission.\n"); linePieces[i] = "metagenome"; categories[headers[i]] = linePieces[i]; + } + Group2Organism[linePieces[0]] = linePieces[i]; + } + if (linePieces[i] != "NA") { allNA[headers[i]] = false; } + } + + //does this sample already match an existing sample? + bool isOkaySample = true; + for (map >:: iterator it2 = mimarks.begin(); it2 != mimarks.end(); it2++) { + if (m->control_pressed) { break; } + bool allSame = true; + for (int i = 1; i < headers.size(); i++) { + if ((it2->second)[headers[i]] != categories[headers[i]]) { allSame = false; } + } + if (allSame) { m->mothurOut("[ERROR]: " + linePieces[0]+ " is a duplicate sample to " + it2->first + ". It has all the same attributes in the MIMarks file. Samples must have distinguishing features to be uploaded to the NCBI library, please correct.\n"); m->control_pressed = true; isOkaySample = false; } + } + if (isOkaySample) { mimarks[linePieces[0]] = categories; } + }else { + m->mothurOut("[ERROR]: " + linePieces[0]+ " is a duplicate sampleName. 
Sample names must be unique, please correct.\n"); m->control_pressed = true; + } + } + } + in.close(); + + //add in values for "scrap" group + map categories; + //start after *sample_name + for (int i = 1; i < headers.size(); i++) { + categories[headers[i]] = "NA"; + if (headers[i] == "organism") { categories[headers[i]] = "metagenome"; } + if (headers[i] == "seq_methods") { categories[headers[i]] = "these sequences were scrapped"; } + if (headers[i] == "title") { categories[headers[i]] = "these sequences were scrapped"; } + } + mimarks["scrap"] = categories; + Group2Organism["scrap"] = "metagenome"; + + if (organismError) { + string organismTypes = ""; + for (int i = 0; i < acceptableOrganisms.size()-1; i++) { organismTypes += acceptableOrganisms[i] + ", "; } + organismTypes += acceptableOrganisms[acceptableOrganisms.size()-1]; + m->mothurOut("[WARNING]: The acceptable organism choices are: " + organismTypes + ".\n"); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "readMIMarksFile"); + exit(1); + } +} //********************************************************************************************************************** // going to have to rework this to allow for other options -- @@ -579,7 +784,7 @@ int SRACommand::readContactFile(){ int SRACommand::readFile(map >& files){ try { - vector theseFiles; + //vector theseFiles; inputfile = file; files.clear(); @@ -641,8 +846,10 @@ int SRACommand::readFile(map >& files){ m->mothurOut("[WARNING]: can't find " + thisFileName1 + ", ignoring.\n"); }else{ in2.close(); } + int openReverse = 1; + ifstream in3; - int openReverse = m->openInputFile(thisFileName2, in3, "noerror"); + openReverse = m->openInputFile(thisFileName2, in3, "noerror"); //if you can't open it, try default location if (openReverse == 1) { @@ -671,22 +878,27 @@ int SRACommand::readFile(map >& files){ if (openReverse == 1) { //can't find it m->mothurOut("[WARNING]: can't find " + thisFileName2 + ", ignoring pair.\n"); }else{ 
in3.close(); } - - + if ((pieces.size() == 2) && (openForward != 1) && (openReverse != 1)) { //good pair and sff or fastq and oligos //process pair - int pos = theseFiles[0].find(".sff"); + int pos = thisFileName1.find(".sff"); if (pos != string::npos) {//these files are sff files isSFF = true; sfffile = thisFileName1; oligosfile = thisFileName2; + if (m->debug) { m->mothurOut("[DEBUG]: about to read oligos\n"); } readOligos(); + if (m->debug) { m->mothurOut("[DEBUG]: about to parse\n"); } parseSffFile(files); + if (m->debug) { m->mothurOut("[DEBUG]: done parsing " + sfffile + "\n"); } }else{ isSFF = false; fastqfile = thisFileName1; oligosfile = thisFileName2; + if (m->debug) { m->mothurOut("[DEBUG]: about to read oligos\n"); } readOligos(); + if (m->debug) { m->mothurOut("[DEBUG]: about to parse\n"); } parseFastqFile(files); + if (m->debug) { m->mothurOut("[DEBUG]: done parsing " + fastqfile + "\n"); } } }else if((pieces.size() == 3) && (openForward != 1) && (openReverse != 1)) { //good pair and paired read @@ -719,16 +931,15 @@ int SRACommand::parseSffFile(map >& files){ isSFF = true; //run sffinfo to parse sff file into individual sampled sff files string commandString = "sff=" + sfffile; - if (groupfile != "") { commandString += ", group=" + groupfile; } - else if (oligosfile != "") { - commandString += ", oligos=" + oligosfile; - //add in pdiffs, bdiffs, ldiffs, sdiffs, tdiffs - if (pdiffs != 0) { commandString += ", pdiffs=" + toString(pdiffs); } - if (bdiffs != 0) { commandString += ", bdiffs=" + toString(bdiffs); } - if (ldiffs != 0) { commandString += ", ldiffs=" + toString(ldiffs); } - if (sdiffs != 0) { commandString += ", sdiffs=" + toString(sdiffs); } - if (tdiffs != 0) { commandString += ", tdiffs=" + toString(tdiffs); } - } + + commandString += ", oligos=" + oligosfile; + //add in pdiffs, bdiffs, ldiffs, sdiffs, tdiffs + if (pdiffs != 0) { commandString += ", pdiffs=" + toString(pdiffs); } + if (bdiffs != 0) { commandString += ", bdiffs=" + 
toString(bdiffs); } + if (ldiffs != 0) { commandString += ", ldiffs=" + toString(ldiffs); } + if (sdiffs != 0) { commandString += ", sdiffs=" + toString(sdiffs); } + if (tdiffs != 0) { commandString += ", tdiffs=" + toString(tdiffs); } + m->mothurOutEndLine(); m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: sffinfo(" + commandString + ")"); m->mothurOutEndLine(); @@ -765,16 +976,15 @@ int SRACommand::parseFastqFile(map >& files){ //run sffinfo to parse sff file into individual sampled sff files string commandString = "fastq=" + fastqfile; - if (groupfile != "") { commandString += ", group=" + groupfile; } - else if (oligosfile != "") { - commandString += ", oligos=" + oligosfile; - //add in pdiffs, bdiffs, ldiffs, sdiffs, tdiffs - if (pdiffs != 0) { commandString += ", pdiffs=" + toString(pdiffs); } - if (bdiffs != 0) { commandString += ", bdiffs=" + toString(bdiffs); } - if (ldiffs != 0) { commandString += ", ldiffs=" + toString(ldiffs); } - if (sdiffs != 0) { commandString += ", sdiffs=" + toString(sdiffs); } - if (tdiffs != 0) { commandString += ", tdiffs=" + toString(tdiffs); } - } + + commandString += ", oligos=" + oligosfile; + //add in pdiffs, bdiffs, ldiffs, sdiffs, tdiffs + if (pdiffs != 0) { commandString += ", pdiffs=" + toString(pdiffs); } + if (bdiffs != 0) { commandString += ", bdiffs=" + toString(bdiffs); } + if (ldiffs != 0) { commandString += ", ldiffs=" + toString(ldiffs); } + if (sdiffs != 0) { commandString += ", sdiffs=" + toString(sdiffs); } + if (tdiffs != 0) { commandString += ", tdiffs=" + toString(tdiffs); } + m->mothurOutEndLine(); m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: fastq.info(" + commandString + ")"); m->mothurOutEndLine(); @@ -865,7 +1075,12 @@ int SRACommand::readOligos(){ string type, oligo, roligo, group; bool hasPrimer = false; bool hasPairedBarcodes = false; pairedOligos = false; - + 
map pairedBarcodes; + map pairedPrimers; + map barcodes; + map primers; + vector linker; + vector spacer, revPrimer; int indexPrimer = 0; int indexBarcode = 0; int indexPairedPrimer = 0; @@ -1055,6 +1270,7 @@ int SRACommand::readOligos(){ } uniqueNames.insert(comboGroupName); Group2Barcode[comboGroupName] = (itBar->second).forward+"."+(itBar->second).reverse; + Group2Primer[comboGroupName] = (itPrimer->second).forward+"."+(itPrimer->second).reverse; } } } @@ -1083,6 +1299,7 @@ int SRACommand::readOligos(){ } uniqueNames.insert(comboGroupName); Group2Barcode[comboGroupName] = itBar->first; + Group2Primer[comboGroupName] = itPrimer->first; } } } @@ -1270,6 +1487,7 @@ bool SRACommand::checkCasesLibStrategy(string& libStrategy){ exit(1); } } + //********************************************************************************************************************** //METAGENOMIC,GENOMIC,TRANSCRIPTOMIC,METATRANSCRIPTOMIC,SYNTHETIC,VIRAL_RNA,OTHER bool SRACommand::checkCasesLibSource(string& libSource){ @@ -1336,5 +1554,55 @@ bool SRACommand::checkCasesLibSelection(string& libSelection){ exit(1); } } +//********************************************************************************************************************** +//METAGENOME,GENOME_SEQUENCING,METAGENOMIC_ASSEMBLY,ASSEMBLY,TRANSCRIPTOME,PROTEOMIC,MAP,CLONE_ENDS,TARGETED_LOCI,RANDOM_SURVEY,EXOME,VARIATION,EPIGENOMICS,PHENOTYPE,GENOTYPE,OTHER +bool SRACommand::checkCasesDataType(string& dataType){ + try { + string original = dataType; + bool isOkay = true; + + //remove users possible case errors + for (int i = 0; i < dataType.size(); i++) { dataType[i] = toupper(dataType[i]); } + + if ((dataType == "METAGENOME") || (dataType == "GENOME_SEQUENCING") || (dataType == "METAGENOMIC_ASSEMBLY") || (dataType == "ASSEMBLY") || (dataType == "TRANSCRIPTOME") || (dataType == "PROTEOMIC") || (dataType == "MAP") || (dataType == "CLONE_ENDS") || (dataType == "TARGETED_LOCI") || (dataType == "RANDOM_SURVEY") || (dataType == "EXOME") 
|| (dataType == "VARIATION") || (dataType == "EPIGENOMICS") || (dataType == "PHENOTYPE") || (dataType == "GENOTYPE") || (dataType == "OTHER")) { } + else { isOkay = false; } + + if (isOkay) { + + }else { + m->mothurOut("[ERROR]: " + original + " is not a valid datatype option. Valid datatype options are METAGENOME,GENOME_SEQUENCING,METAGENOMIC_ASSEMBLY,ASSEMBLY,TRANSCRIPTOME,PROTEOMIC,MAP,CLONE_ENDS,TARGETED_LOCI,RANDOM_SURVEY,EXOME,VARIATION,EPIGENOMICS,PHENOTYPE,GENOTYPE,OTHER."); m->mothurOutEndLine(); abort = true; + } + + return isOkay; + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "checkCasesDataType"); + exit(1); + } +} +//********************************************************************************************************************** +bool SRACommand::sanityCheckMiMarksGroups(){ + try { + bool isOkay = true; + + for (int i = 0; i < Groups.size(); i++) { + if (m->control_pressed) { break; } + + map >::iterator it = mimarks.find(Groups[i]); + if (it == mimarks.end()) { + isOkay = false; + m->mothurOut("[ERROR]: MIMarks file is missing group " + Groups[i] + ", please correct.\n"); + } + } + + if (!isOkay) { m->control_pressed = true; } + + return isOkay; + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "sanityCheckMiMarksGroups"); + exit(1); + } +} //********************************************************************************************************************** diff --git a/sracommand.h b/sracommand.h index 5cc3714..c53a17d 100644 --- a/sracommand.h +++ b/sracommand.h @@ -36,27 +36,28 @@ public: private: bool abort, isSFF, pairedOligos; int tdiffs, bdiffs, pdiffs, sdiffs, ldiffs; - string sfffile, fastqfile, outputDir, groupfile, file, oligosfile, contactfile, inputfile; - string libStrategy, libSource, libSelection, libLayout, platform, instrumentModel, fileType; - string submissionName, lastName, firstName, email, centerName, centerType, description; - vector outputNames, Groups, revPrimer; + string sfffile, fastqfile, 
outputDir, file, oligosfile, contactfile, inputfile, mimarksfile; + string libStrategy, libSource, libSelection, libLayout, platform, instrumentModel, fileType, dataType; + string submissionName, lastName, firstName, email, centerName, centerType, description, website, orientation, packageType; + string projectName, grantId, grantTitle, grantAgency, projectTitle; + vector outputNames, Groups; vector primerNameVector; vector barcodeNameVector; map Group2Barcode; - map pairedBarcodes; - map pairedPrimers; - map barcodes; - map primers; - vector linker; - vector spacer; + map Group2Primer; + map Group2Organism; + map > mimarks; //group -> valueForGroup> ex. F003D001 -> 42.282026 -83.733850> bool checkCasesInstrumentModels(string&); bool checkCasesPlatforms(string&); bool checkCasesLibStrategy(string&); bool checkCasesLibSource(string&); bool checkCasesLibSelection(string&); + bool checkCasesDataType(string&); + bool sanityCheckMiMarksGroups(); int readFile(map >&); int readContactFile(); + int readMIMarksFile(); int readOligos(); int parseSffFile(map >&); int parseFastqFile(map >&);