+//**********************************************************************************************************************
+//air, host_associated, human_associated, human_gut, human_oral, human_skin, human_vaginal, microbial, miscellaneous, plant_associated, sediment, soil, wastewater or water
+//all packages require: *sample_name *organism *collection_date *biome *feature *material *geo_loc_name *lat_lon
+//air: *altitude
+//host_associated, human_associated, human_gut, human_oral, human_skin, human_vaginal, plant_associated: *host
+//microbial, sediment, soil: *depth *elev
+//water: *depth
+int SRACommand::readMIMarksFile(){
+ try {
+ //acceptable organisms
+ vector<string> acceptableOrganisms;
+ bool organismError = false;
+ //ecological
+ acceptableOrganisms.push_back("activated carbon metagenome"); acceptableOrganisms.push_back("activated sludge metagenome"); acceptableOrganisms.push_back("air metagenome"); acceptableOrganisms.push_back("anaerobic digester metagenome"); acceptableOrganisms.push_back("ant fungus garden metagenome"); acceptableOrganisms.push_back("aquatic metagenome"); acceptableOrganisms.push_back("activated carbon metagenome"); acceptableOrganisms.push_back("activated sludge metagenome"); acceptableOrganisms.push_back("beach sand metagenome"); acceptableOrganisms.push_back("biofilm metagenome"); acceptableOrganisms.push_back("biofilter metagenome"); acceptableOrganisms.push_back("biogas fermenter metagenome"); acceptableOrganisms.push_back("bioreactor metagenome"); acceptableOrganisms.push_back("bioreactor sludge metagenome"); acceptableOrganisms.push_back("clinical metagenome"); acceptableOrganisms.push_back("coal metagenome"); acceptableOrganisms.push_back("compost metagenome"); acceptableOrganisms.push_back("dust metagenome"); acceptableOrganisms.push_back("fermentation metagenome"); acceptableOrganisms.push_back("food fermentation metagenome"); acceptableOrganisms.push_back("food metagenome"); acceptableOrganisms.push_back("freshwater metagenome"); acceptableOrganisms.push_back("freshwater sediment metagenome"); acceptableOrganisms.push_back("groundwater metagenome"); acceptableOrganisms.push_back("halite metagenome"); acceptableOrganisms.push_back("hot springs metagenome"); acceptableOrganisms.push_back("hydrocarbon metagenome"); acceptableOrganisms.push_back("hydrothermal vent metagenome"); acceptableOrganisms.push_back("hypersaline lake metagenome"); acceptableOrganisms.push_back("ice metagenome"); acceptableOrganisms.push_back("indoor metagenome"); acceptableOrganisms.push_back("industrial waste metagenome"); acceptableOrganisms.push_back("mangrove metagenome"); acceptableOrganisms.push_back("marine metagenome"); acceptableOrganisms.push_back("marine sediment metagenome"); acceptableOrganisms.push_back("microbial mat metagenome"); acceptableOrganisms.push_back("mine drainage metagenome"); acceptableOrganisms.push_back("mixed culture metagenome"); acceptableOrganisms.push_back("oil production facility metagenome"); acceptableOrganisms.push_back("paper pulp metagenome"); acceptableOrganisms.push_back("permafrost metagenome"); acceptableOrganisms.push_back("plastisphere metagenome"); acceptableOrganisms.push_back("power plant metagenome"); acceptableOrganisms.push_back("retting rhizosphere metagenome"); acceptableOrganisms.push_back("rock metagenome"); acceptableOrganisms.push_back("salt lake metagenome"); acceptableOrganisms.push_back("saltern metagenome"); acceptableOrganisms.push_back("sediment metagenome"); acceptableOrganisms.push_back("snow metagenome"); acceptableOrganisms.push_back("soil metagenome"); acceptableOrganisms.push_back("stromatolite metagenome"); acceptableOrganisms.push_back("terrestrial metagenome"); acceptableOrganisms.push_back("tomb wall metagenome"); acceptableOrganisms.push_back("wastewater metagenome"); acceptableOrganisms.push_back("wetland metagenome"); acceptableOrganisms.push_back("whale fall metagenome");
+ //oganismal
+ acceptableOrganisms.push_back("algae metagenome"); acceptableOrganisms.push_back("ant metagenome"); acceptableOrganisms.push_back("bat metagenome"); acceptableOrganisms.push_back("beetle metagenome"); acceptableOrganisms.push_back("bovine gut metagenome"); acceptableOrganisms.push_back("bovine metagenome"); acceptableOrganisms.push_back("chicken gut metagenome"); acceptableOrganisms.push_back("coral metagenome"); acceptableOrganisms.push_back("echinoderm metagenome"); acceptableOrganisms.push_back("endophyte metagenome"); acceptableOrganisms.push_back("epibiont metagenome"); acceptableOrganisms.push_back("fish metagenome"); acceptableOrganisms.push_back("fossil metagenome"); acceptableOrganisms.push_back("gill metagenome"); acceptableOrganisms.push_back("gut metagenome"); acceptableOrganisms.push_back("honeybee metagenome"); acceptableOrganisms.push_back("human gut metagenome"); acceptableOrganisms.push_back("human lung metagenome"); acceptableOrganisms.push_back("human metagenome"); acceptableOrganisms.push_back("human nasal/pharyngeal metagenome"); acceptableOrganisms.push_back("human oral metagenome"); acceptableOrganisms.push_back("human skin metagenome"); acceptableOrganisms.push_back("insect gut metagenome"); acceptableOrganisms.push_back("insect metagenome"); acceptableOrganisms.push_back("mollusc metagenome"); acceptableOrganisms.push_back("mosquito metagenome"); acceptableOrganisms.push_back("mouse gut metagenome"); acceptableOrganisms.push_back("mouse metagenome"); acceptableOrganisms.push_back("mouse skin metagenome"); acceptableOrganisms.push_back("nematode metagenome"); acceptableOrganisms.push_back("oral metagenome"); acceptableOrganisms.push_back("phyllosphere metagenome"); acceptableOrganisms.push_back("pig metagenome"); acceptableOrganisms.push_back("plant metagenome"); acceptableOrganisms.push_back("primate metagenome"); acceptableOrganisms.push_back("rat metagenome"); acceptableOrganisms.push_back("root metagenome"); acceptableOrganisms.push_back("sea squirt metagenome"); acceptableOrganisms.push_back("seed metagenome"); acceptableOrganisms.push_back("shoot metagenome"); acceptableOrganisms.push_back("skin metagenome"); acceptableOrganisms.push_back("snake metagenome"); acceptableOrganisms.push_back("sponge metagenome"); acceptableOrganisms.push_back("stomach metagenome"); acceptableOrganisms.push_back("symbiont metagenome"); acceptableOrganisms.push_back("termite gut metagenome"); acceptableOrganisms.push_back("termite metagenome"); acceptableOrganisms.push_back("upper respiratory tract metagenome"); acceptableOrganisms.push_back("urine metagenome"); acceptableOrganisms.push_back("viral metagenome"); acceptableOrganisms.push_back("wallaby gut metagenome"); acceptableOrganisms.push_back("wasp metagenome"); acceptableOrganisms.push_back("sythetic metagenome"); acceptableOrganisms.push_back("metagenome");
+
+ vector<string> requiredFieldsForPackage;
+ requiredFieldsForPackage.push_back("sample_name"); requiredFieldsForPackage.push_back("organism");
+ requiredFieldsForPackage.push_back("collection_date"); requiredFieldsForPackage.push_back("biome");
+ requiredFieldsForPackage.push_back("feature"); requiredFieldsForPackage.push_back("material");
+ requiredFieldsForPackage.push_back("geo_loc_name"); requiredFieldsForPackage.push_back("lat_lon");
+ requiredFieldsForPackage.push_back("seq_methods"); requiredFieldsForPackage.push_back("title");
+ vector<string> chooseAtLeastOneForPackage;
+
+ ifstream in;
+ m->openInputFile(mimarksfile, in);
+
+ //read comments
+ string temp; packageType = "";
+ while(!in.eof()) {
+
+ if (m->control_pressed) { break; }
+ temp = m->getline(in); m->gobble(in);
+
+ if (m->debug) { m->mothurOut("[DEBUG]: " + temp + "\n"); }
+
+ if (temp[0] == '#') {
+ int pos = temp.find("Environmental");
+ if (pos != string::npos) {
+ for (int i = pos+14; i < temp.length(); i++) {
+ if (!isspace(temp[i])) { packageType += temp[i]; }
+ else { i+= temp.length(); }
+ }
+ }
+ }
+ else{ break; } //hit headers line
+ }
+
+ vector<string> headers; m->splitAtChar(temp, headers, '\t');
+ m->removeBlanks(headers);
+ //remove * from required's
+ for (int i = 0; i < headers.size(); i++) {
+ if (headers[i][0] == '*') { headers[i] = headers[i].substr(1); }
+ if (headers[i][0] == '*') { headers[i] = headers[i].substr(1); chooseAtLeastOneForPackage.push_back(headers[i]); } //secondary condition
+ if (m->debug) { m->mothurOut("[DEBUG]: " + headers[i] + "\n"); }
+ }
+
+ if (m->debug) { m->mothurOut("[DEBUG]: packageType = '" + packageType + "'\n"); }
+
+ //check to make sure package has all its required parts
+ //MIMARKS.specimen.water.3.0
+ if (packageType == "MIMARKS.specimen.air.3.0") { requiredFieldsForPackage.push_back("altitude"); }
+ else if ((packageType == "MIMARKS.specimen.host-associated.3.0") || (packageType == "MIMARKS.specimen.human-associated.3.0") || (packageType == "MIMARKS.specimen.human-gut.3.0") || (packageType == "MIMARKS.specimen.human-oral.3.0") || (packageType == "MIMARKS.specimen.human-skin.3.0") || (packageType == "MIMARKS.specimen.human-vaginal.3.0") || (packageType == "MIMARKS.specimen.plant-associated.3.0")) { requiredFieldsForPackage.push_back("host"); }
+ else if ((packageType == "MIMARKS.specimen.microbial.3.0") || (packageType == "MIMARKS.specimen.sediment.3.0") || (packageType == "soil")) { requiredFieldsForPackage.push_back("depth"); requiredFieldsForPackage.push_back("elev"); }
+ else if (packageType == "MIMARKS.specimen.water.3.0") { requiredFieldsForPackage.push_back("depth"); }
+ else if ((packageType == "MIMARKS.specimen.miscellaneous.3.0") || (packageType == "wastewater")) { }
+ else {
+ m->mothurOut("[ERROR]: unknown package " + packageType + ", please correct.\n"); m->control_pressed = true; in.close(); return 0;
+ }
+
+ if (!m->isSubset(headers, requiredFieldsForPackage)){
+ string requiredFields = "";
+ for (int i = 0; i < requiredFieldsForPackage.size()-1; i++) { requiredFields += requiredFieldsForPackage[i] + ", "; } requiredFields += requiredFieldsForPackage[requiredFieldsForPackage.size()-1];
+ m->mothurOut("[ERROR]: missing required fields for package, please correct. Required fields are " + requiredFields + ".\n"); m->control_pressed = true; in.close(); return 0;
+ }
+
+ if (m->debug) { m->mothurOut("[DEBUG]: chooseAtLeastOneForPackage.size() = " + toString(chooseAtLeastOneForPackage.size()) + "\n"); }
+
+ if (!m->inUsersGroups(chooseAtLeastOneForPackage, headers)){ //returns true if any of the choose at least ones are in headers
+ string requiredFields = "";
+ for (int i = 0; i < chooseAtLeastOneForPackage.size()-1; i++) { requiredFields += chooseAtLeastOneForPackage[i] + ", "; cout << chooseAtLeastOneForPackage[i] << endl; }
+ if (chooseAtLeastOneForPackage.size() < 1) { requiredFields += chooseAtLeastOneForPackage[chooseAtLeastOneForPackage.size()-1]; }
+ m->mothurOut("[ERROR]: missing a choose at least one fields for the package, please correct. These are marked with '**'. Required fields are " + requiredFields + ".\n"); m->control_pressed = true; in.close(); return 0;
+ }
+
+ map<string, bool> allNA; for (int i = 1; i < headers.size(); i++) { allNA[headers[i]] = true; }
+ while(!in.eof()) {
+
+ if (m->control_pressed) { break; }
+
+ temp = m->getline(in); m->gobble(in);
+
+ if (m->debug) { m->mothurOut("[DEBUG]: " + temp + "\n"); }
+
+ string original = temp;
+ vector<string> linePieces; m->splitAtChar(temp, linePieces, '\t');
+ m->removeBlanks(linePieces);
+
+ if (linePieces.size() != headers.size()) { m->mothurOut("[ERROR]: line: " + original + " contains " + toString(linePieces.size()) + " columns, but you have " + toString(headers.size()) + " column headers, please correct.\n"); m->control_pressed = true; }
+ else {
+ map<string, map<string, string> >:: iterator it = mimarks.find(linePieces[0]);
+
+ if (it == mimarks.end()) {
+ map<string, string> categories;
+ //start after *sample_name
+ for (int i = 1; i < headers.size(); i++) {
+ categories[headers[i]] = linePieces[i];
+ //check the users inputs for appropriate organisms
+ if (headers[i] == "organism") {
+ if (!m->inUsersGroups(linePieces[i], acceptableOrganisms)) { //not an acceptable organism
+ organismError = true;
+ m->mothurOut("[WARNING]: " + linePieces[i]+ " is not an acceptable organism, changing to metagenome. You can correct the issue and rerun the command, or NCBI will allow you to modify the organism after submission.\n"); linePieces[i] = "metagenome"; categories[headers[i]] = linePieces[i];
+ }
+ Group2Organism[linePieces[0]] = linePieces[i];
+ }
+ if (linePieces[i] != "NA") { allNA[headers[i]] = false; }
+ }
+
+ //does this sample already match an existing sample?
+ bool isOkaySample = true;
+ for (map<string, map<string, string> >:: iterator it2 = mimarks.begin(); it2 != mimarks.end(); it2++) {
+ if (m->control_pressed) { break; }
+ bool allSame = true;
+ for (int i = 1; i < headers.size(); i++) {
+ if ((it2->second)[headers[i]] != categories[headers[i]]) { allSame = false; }
+ }
+ if (allSame) { m->mothurOut("[ERROR]: " + linePieces[0]+ " is a duplicate sample to " + it2->first + ". It has all the same attributes in the MIMarks file. Samples must have distinguishing features to be uploaded to the NCBI library, please correct.\n"); m->control_pressed = true; isOkaySample = false; }
+ }
+ if (isOkaySample) { mimarks[linePieces[0]] = categories; }
+ }else {
+ m->mothurOut("[ERROR]: " + linePieces[0]+ " is a duplicate sampleName. Sample names must be unique, please correct.\n"); m->control_pressed = true;
+ }
+ }
+ }
+ in.close();
+
+ //add in values for "scrap" group
+ map<string, string> categories;
+ //start after *sample_name
+ for (int i = 1; i < headers.size(); i++) {
+ categories[headers[i]] = "NA";
+ if (headers[i] == "organism") { categories[headers[i]] = "metagenome"; }
+ if (headers[i] == "seq_methods") { categories[headers[i]] = "these sequences were scrapped"; }
+ if (headers[i] == "title") { categories[headers[i]] = "these sequences were scrapped"; }
+ }
+ mimarks["scrap"] = categories;
+ Group2Organism["scrap"] = "metagenome";
+
+ if (organismError) {
+ string organismTypes = "";
+ for (int i = 0; i < acceptableOrganisms.size()-1; i++) { organismTypes += acceptableOrganisms[i] + ", "; }
+ organismTypes += acceptableOrganisms[acceptableOrganisms.size()-1];
+ m->mothurOut("[WARNING]: The acceptable organism choices are: " + organismTypes + ".\n");
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SRACommand", "readMIMarksFile");
+ exit(1);
+ }
+}