Source code for olm.USGS.WQXtoPandas

#! /usr/bin/env python

"""
This script takes an input file in USGS/EPA WQX xml format and creates a multi-indexed
pandas DataFrame that contains time series of water quality data and discharge, combined
with layers that contain metadata for each data value. You can call the script from
the command line using WQXtoPandas [input WQX start file], or import the runWQXtoPandas
function to call it from within a Python session.
"""

from __future__ import print_function
import sys
import xlrd
import os
import time
import argparse
import requests
from glob import glob
from math import ceil
import pickle as pickle
from lxml import etree
from pandas import DataFrame, to_datetime, concat, ExcelWriter
from olm.USGS.PhreeqcPandas import processMidf

# import functions from olm package
from olm.USGS.siteListExtraction import extractSitesFromXML
from olm.USGS.siteListExtraction import extractSitesFromText
from olm.USGS.DataRetrieval import querySiteList, GetDailyDischarge, GetSiteData
from olm.USGS.dataSlice import extractValues


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if pickling fails."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def _load_wqx_tree(xmlLocation, fromFile, xmlSaveFile, restart, max_xml_query_tries):
    """Return the parsed WQX element tree, or None if it should not/cannot be obtained.

    None is returned when running in restart mode and ``xmlSaveFile`` already
    exists, when the query keeps failing after ``max_xml_query_tries`` retries,
    or when the downloaded xml cannot be written/re-read. A message is printed
    in each of these cases.
    """
    if fromFile:
        # read from file
        print("xmlLocation", xmlLocation)
        return etree.ElementTree(file=xmlLocation)

    # check whether we already have a matching xml file
    if os.path.isfile(xmlSaveFile):
        if restart:
            # If we are restarting a failed run, skip existing xml files.
            print(
                "Skipping "
                + xmlSaveFile
                + " because it exists and we are running in restart mode."
            )
            return None
        # Keep asking until the answer starts with "y" or "n".
        while True:
            answer = input(
                "An xml file ("
                + xmlSaveFile
                + ") already exists. \n Use this instead of html query (y or n)?"
            )
            if answer.startswith("y"):
                return etree.ElementTree(file=xmlSaveFile)
            if answer.startswith("n"):
                break

    # No usable xml file on disk (or the user asked for a fresh one): query for it.
    print("Obtaining xml file from USGS NWIS using html query...")
    ntries = 0
    while True:
        if ntries > 0:
            print("Trying again: try number " + str(ntries))
        print("XML query string: ", xmlLocation)
        r = requests.get(xmlLocation)
        if r.ok:
            break
        # There is some problem with the xml query
        print("Response: ", str(r))
        print("Reason: ", r.reason)
        if "warning" in r.headers:
            print("Warning: ", r.headers["warning"])
        ntries += 1
        if ntries > max_xml_query_tries:
            print("Reached maximum number of tries. Stopping this query.")
            return None
        print("Pausing for one minute and will retry...")
        time.sleep(60)

    try:
        # Write the raw response bytes so the file's encoding always matches
        # its own XML declaration, independent of the platform's text encoding.
        with open(xmlSaveFile, "wb") as xmlFile:
            xmlFile.write(r.content)
        return etree.ElementTree(file=xmlSaveFile)
    except IOError:
        print(("Problem writing to xml file to store html query: " + xmlSaveFile))
        return None


def _process_result(result, WQX, charDict, USGS, sampleDict, sampleMetaDict):
    """Fold one <Result> element into ``sampleDict``/``sampleMetaDict`` in place.

    The result is ignored unless its characteristic is listed in ``charDict``
    and it satisfies the fraction, pcode (USGS sites only) and quality
    criteria given there. Duplicate characteristics within a sample are
    resolved by pcode priority (order of the ';'-separated pcode list),
    otherwise averaged when units match and both values are detections.
    """
    resultdesc = result.find(WQX + "ResultDescription")
    if resultdesc is None:
        # Nothing to read for this result; skip it rather than crash the run.
        return
    characteristic = resultdesc.findtext(WQX + "CharacteristicName")
    if characteristic not in charDict:
        return
    criteria = charDict[characteristic]

    samplefraction = resultdesc.findtext(WQX + "ResultSampleFractionText")
    pcode = resultdesc.findtext(WQX + "USGSPCode")
    quality = resultdesc.findtext(WQX + "ResultStatusIdentifier")
    measure = resultdesc.find(WQX + "ResultMeasure")
    detection = resultdesc.findtext(WQX + "ResultDetectionConditionText")
    count = 1.0
    if measure is None and detection is None:
        return

    if measure is not None:
        value = measure.findtext(WQX + "ResultMeasureValue")
        units = measure.findtext(WQX + "MeasureUnitCode")
        # EPA system does not have detection info.
        # Check for < in value text.
        if "<" in str(value):
            value = value[1:]
            nondetect = True
        else:
            nondetect = False
    else:
        # Non-detect without a measured value: fall back to the reported
        # quantitation limit, if any. ``units`` must be (re)set here too,
        # otherwise it would be undefined for this result.
        nondetect = True
        value = None
        units = None
        labinfo = result.find(WQX + "ResultLabInformation")
        if labinfo is not None:
            quantLimit = labinfo.find(WQX + "ResultDetectionQuantitationLimit")
            if quantLimit is not None:
                limitMeasure = quantLimit.find(
                    WQX + "DetectionQuantitationLimitMeasure"
                )
                if limitMeasure is not None:
                    value = limitMeasure.findtext(WQX + "MeasureValue")
                    units = limitMeasure.findtext(WQX + "MeasureUnitCode")

    # Map each acceptable pcode to its priority (position in the ';' list;
    # a lower index means a higher priority).
    pcodeDict = {}
    for codePriority, code in enumerate(criteria["pcode"].split(";")):
        code = code.strip()
        if code != "":
            pcodeDict[code] = codePriority

    # Check whether characteristic meets criteria for inclusion
    # ("0" in the start file means "no restriction").
    if criteria["fraction"] != "0" and criteria["fraction"] != samplefraction:
        return
    if USGS and criteria["pcode"] != "0" and pcode not in pcodeDict:
        return
    if criteria["quality"] != "0" and criteria["quality"] != quality:
        return

    # Process duplicate characteristics
    if characteristic in sampleDict:
        priorMeta = sampleMetaDict[characteristic]
        priorPcode = priorMeta["pcode"]
        if priorPcode is not None:
            # if there are already multiple pcodes get only first one
            priorPcode = priorPcode.split(";")[0]

        averageValue = True
        if USGS and len(pcodeDict) > 1:
            thisPcodePriority = pcodeDict[pcode]
            priorPcodePriority = pcodeDict[priorPcode]
            if thisPcodePriority > priorPcodePriority:
                # previous characteristic remains
                return
            # Equal priority -> average; higher priority -> replace the prior.
            averageValue = thisPcodePriority == priorPcodePriority

        if averageValue:
            # Only average if we have the same units
            if units != priorMeta["units"]:
                return
            if priorMeta["nondetect"]:
                if nondetect:
                    # If both are non-detect, no need to add
                    return
                # Prior was non-detect but this one isn't: use this one instead.
                averageValue = False
            elif nondetect:
                # This one is non-detect, prior was not: keep the prior.
                return

        if averageValue:
            # Running mean of this value with the existing value(s).
            count = priorMeta["count"] + 1.0
            oldvalue = float(sampleDict[characteristic])
            value = str((oldvalue * (count - 1.0) + float(value)) / count)
            # Record both pcodes when available (non-USGS results may have none).
            if priorPcode is not None and pcode is not None:
                pcode = priorPcode + "; " + pcode
            elif pcode is None:
                pcode = priorPcode

    sampleDict[characteristic] = value
    sampleMetaDict[characteristic] = {
        "samplefraction": samplefraction,
        "units": units,
        "pcode": pcode,
        "quality": quality,
        "count": count,
        "nondetect": nondetect,
    }


def _build_sample_row(sampleDict, sampleMetaDict, descriptionDict, datetext):
    """Build the one-row, column-multiindexed DataFrame for a single sample.

    The top column level names the layer ("data", "time", "timezone",
    "pcode", "quality", "fraction", "units", "count", "nondetect"); the row
    index is the sample timestamp.
    """
    if descriptionDict["time"] != "":
        rowdate = to_datetime(datetext + " " + descriptionDict["time"])
    else:
        rowdate = to_datetime(datetext)

    columns = list(sampleMetaDict.keys())
    layers = {
        "data": DataFrame(sampleDict, index=[rowdate], dtype="float"),
        "time": DataFrame(descriptionDict["time"], index=[rowdate], columns=columns),
        "timezone": DataFrame(
            descriptionDict["timezone"], index=[rowdate], columns=columns
        ),
    }
    # (layer name, key inside each sampleMetaDict entry)
    # NOTE(review): relies on extractValues returning values in the same
    # order as sampleMetaDict.keys() -- confirm in olm.USGS.dataSlice.
    for layer, metaKey in (
        ("pcode", "pcode"),
        ("quality", "quality"),
        ("fraction", "samplefraction"),
        ("units", "units"),
        ("count", "count"),
        ("nondetect", "nondetect"),
    ):
        layers[layer] = DataFrame(
            [extractValues(sampleMetaDict, [metaKey])["values"]],
            index=[rowdate],
            columns=columns,
        )
    return concat(layers, axis=1)


def _write_log(logName, START_FILE, charDict, samples_processed, samples_not_processed):
    """Write the run summary to ``logName``; IOError propagates to the caller."""
    with open(logName, "w") as log_file:
        # START_FILE defaults to None, so it must be converted explicitly.
        print("Start file = " + str(START_FILE), file=log_file)
        print(
            "Number of Samples Processed = " + str(len(samples_processed)),
            file=log_file,
        )
        print(
            "Number of Samples Not Processed = " + str(len(samples_not_processed)),
            file=log_file,
        )
        print("###############", file=log_file)
        print("Characteristics", file=log_file)
        print("###############", file=log_file)
        printColumnNames = True
        for key, flags in charDict.items():
            if printColumnNames:
                # Header row, taken from the first characteristic's flags.
                names = ["characteristic"] + [str(column) for column in flags]
                print("\t".join(names), file=log_file)
                printColumnNames = False
            # Only string-valued flags are written out.
            columns = [str(key)] + [
                flags[column] for column in flags if isinstance(flags[column], str)
            ]
            print("\t".join(columns), file=log_file)
        print("###############", file=log_file)
        print("Samples processed", file=log_file)
        print("###############", file=log_file)
        for line in samples_processed:
            print(line, file=log_file)
        print("###############", file=log_file)
        print("Samples not processed", file=log_file)
        print("###############", file=log_file)
        for line in samples_not_processed:
            print(line, file=log_file)


def WQXtoPandas(
    xmlLocation,
    charDict,
    outputPath=".",
    fromFile=False,
    outputDirName="Processed-Sites",
    RUN_PHREEQC=False,
    PHREEQC_PATH="/home/mcoving/phreeqc-2.18.0/bin/",
    DATABASE_FILE="/home/mcoving/phreeqc-2.18.0/database/phreeqc.dat",
    LOG_FILE="Result.log",
    START_FILE=None,
    splittag="",
    bracket_charge_balance=False,
    max_xml_query_tries=20,
    restart=False,
):
    """
    Processes a WQX xml data file and loads data for each site in the WQX file
    into Pandas data objects that are stored in directories for each site.

    Parameters
    ----------
    xmlLocation : string
        Content depends on mode in which WQXtoPandas is run. When fromFile is
        set to False (input methods 2 or 3 in excel file) this string contains
        the html for a query to the USGS NWIS database to obtain an xml file of
        the desired data. Alternatively, if fromFile is True (input method 1 in
        excel file) then this string contains the name of the xml file from
        which to read the data.
    charDict : dict
        A dictionary containing information about the characteristics to be
        processed. Keys are EPA SRS characteristic names. Each entry in the
        dictionary is a second dictionary that contains keys IsRequired, pcode,
        fraction, and quality. These entries tell WQXtoPandas whether a given
        characteristic is required in order to process a sample, and whether a
        specific pcode, fraction, or quality should be required. A value of
        "0" means "no requirement"; pcode may be a ';'-separated list in
        priority order. See excel example file for more details.
    outputPath : string
        Path to directory that will contain output directory.
    fromFile : boolean
        True if data will be read from an xml file already present on computer.
        False if xml file should be queried from NWIS. (Default=False)
    outputDirName : string
        Name of output directory where all site data will be written out.
        (Default='Processed-Sites')
    RUN_PHREEQC : boolean
        Set to true if samples should be processed through PHREEQC.
        (Default=False)
    PHREEQC_PATH : string
        Path to PHREEQC executable (folder only, not executable file name).
    DATABASE_FILE : string
        Path to database file that PHREEQC should use, including database file
        name.
    LOG_FILE : string
        Name of log file that WQXtoPandas will create. (Default='Result.log')
    START_FILE : string
        Name of xls start file that was used to run this instance of
        WQXtoPandas. Name will be written out in log file.
    splittag : string
        Suffix appended to LOG_FILE to name the log file and (with '.xml'
        added) the saved xml query result, so that split queries do not
        overwrite each other. (Default='')
    bracket_charge_balance : bool
        If set to true, WQXtoPandas will alternately force charge balance on
        calcium and alkalinity. While the latter is not physically meaningful,
        this provides a useful estimate of uncertainty for cases with high
        charge balance errors. This is most useful for water that is very
        dilute or with high organic content, such that titrated alkalinity
        values are artificially high.
    max_xml_query_tries : int
        Maximum number of times to retry retrieving an xml file using a query
        to the Water Quality Portal database. Default = 20.
    restart : bool
        Boolean to enable restarting failed run. If set to True, then the
        function will skip over any queries that already have an xml file
        created. Default = False.

    Returns
    -------
    int
        0 if execution successful, -1 in case of error (including a query
        that is skipped because of restart mode).

    Notes
    -----
    Designed to be run through convenience function runWQXtoPandas().
    """
    try:
        # Check to see if output directory exists
        absOutputDirPath = os.path.abspath(outputPath)
        sitesdir = os.path.join(absOutputDirPath, outputDirName)
        print("sitesdir", sitesdir)
        if not os.path.exists(sitesdir):
            try:
                os.makedirs(sitesdir)
            except OSError:
                print(
                    (
                        "Problem creating output directory. Check output path name: "
                        + outputPath
                    )
                )
                return -1

        # create xml tree
        xmlSaveFile = None if fromFile else LOG_FILE + splittag + ".xml"
        wqxtree = _load_wqx_tree(
            xmlLocation, fromFile, xmlSaveFile, restart, max_xml_query_tries
        )
        if wqxtree is None:
            return -1

        # begin parsing XML tree
        root = wqxtree.getroot()
        # get namespace map
        NSMAP = root.nsmap
        WQX = "{%s}" % NSMAP[None]

        # iterate over all <Activity> tags within file and process each sample
        samples_processed = []
        samples_not_processed = []
        sitesDict = {}
        for activity in wqxtree.iter(tag=WQX + "Activity"):
            processThisSample = True
            reason = ""
            description = activity.find(WQX + "ActivityDescription")
            if description is not None:
                datetext = description.findtext(WQX + "ActivityStartDate")
                starttime = description.find(WQX + "ActivityStartTime")
                if starttime is not None:
                    timetext = starttime.findtext(WQX + "Time") or ""
                    timezone = starttime.findtext(WQX + "TimeZoneCode") or ""
                else:
                    timetext = ""
                    timezone = ""
                location = description.findtext(WQX + "MonitoringLocationIdentifier")
                if location is None or datetext is None:
                    # Cannot file this sample under a site/date; report it instead.
                    processThisSample = False
                    reason = "Missing location or date"
                    location = location or "Unknown location"
                    datetext = datetext or "unknown date"
                USGS = location.startswith("USGS-")
                descriptionDict = {
                    "location": location,
                    "date": datetext,
                    "time": timetext,
                    "timezone": timezone,
                }
            else:
                # Define placeholders so the messages below never reuse values
                # from a previous activity (or fail on the first one).
                descriptionDict = None
                location = "Unknown location"
                datetext = "unknown date"
                USGS = False
                processThisSample = False
                reason = "No description"
            print(("Processing sample from " + location + " on " + datetext))

            # create null sample dict
            sampleDict = {}
            sampleMetaDict = {}
            # iterate though all results for this activity
            if processThisSample:
                for result in activity.iter(tag=WQX + "Result"):
                    try:
                        _process_result(
                            result, WQX, charDict, USGS, sampleDict, sampleMetaDict
                        )
                    except etree.XMLSyntaxError as detail:
                        print("File contains invalid XML syntax: ", detail)
                        processThisSample = False
                        reason = "Entry contains invalid XML syntax."
                        break
            # end results loop

            # check whether sample has all the required constituents
            if processThisSample:
                for characteristic in charDict:
                    if charDict[characteristic]["IsRequired"] != "0":
                        if characteristic not in sampleDict:
                            processThisSample = False
                            reason += characteristic + " not available. "

            if processThisSample:
                # check to see whether site directory exists, if not, create it
                sampledir = os.path.join(sitesdir, location)
                if not os.path.exists(sampledir):
                    try:
                        os.makedirs(sampledir)
                    except OSError:
                        print(("Problem creating location directory: " + sampledir))
                        processThisSample = False
                        reason = "Problem creating location directory: " + sampledir

            if processThisSample:
                # Pull daily discharge data from USGS website; only USGS sites
                # are queried. Up to 6 attempts (first try plus 5 retries).
                # Currently hard-wired to pcode 00060 (daily discharge, cfs).
                dischargeDict = None
                if USGS:
                    for _attempt in range(6):
                        candidate = GetDailyDischarge(location, datetext)
                        if candidate != -1:
                            dischargeDict = candidate
                            break

                dischargeMeta = {
                    "units": "cfs",
                    "pcode": "00060",
                    "quality": None,
                    "count": 1,
                    "samplefraction": None,
                    "nondetect": False,
                }
                if dischargeDict is not None:
                    sampleDict["Stream flow, mean. daily"] = dischargeDict["discharge"]
                    dischargeMeta["quality"] = dischargeDict["quality"]
                    descriptionDict["name"] = dischargeDict["name"]
                else:
                    # Possibly allow this sample to be thrown out if no mean
                    # daily discharge, and/or similar for instantaneous discharge
                    sampleDict["Stream flow, mean. daily"] = None
                sampleMetaDict["Stream flow, mean. daily"] = dischargeMeta

                # Create Multiindex Dataframe row for this sample date
                sampleMultiindexRow = _build_sample_row(
                    sampleDict, sampleMetaDict, descriptionDict, datetext
                )

                # All data is kept in memory until the end.
                # This could cause memory problems with large data sets.
                if location in sitesDict:
                    sitesDict[location] = concat(
                        [sitesDict[location], sampleMultiindexRow], axis=0
                    )
                else:
                    sitesDict[location] = sampleMultiindexRow

            # record the outcome for this sample
            if processThisSample:
                samples_processed.append(location + " " + datetext)
            else:
                samples_not_processed.append(location + " " + datetext + " - " + reason)

        print(("Number of Samples Processed = " + str(len(samples_processed))))
        print(("Number of Samples Not Processed = " + str(len(samples_not_processed))))

        # Write out individual site data pickle and xlsx files in each site directory
        print("Writing out site data files...")
        for location, midf in sitesDict.items():
            print(location)
            pickleFile = os.path.join(sitesdir, location, location + "-Dataframe.pkl")
            _dump_pickle(midf, pickleFile)
            midx = midf.keys()
            # One sheet per top-level layer of the column multiindex.
            with ExcelWriter(pickleFile[:-3] + "xlsx") as writer:
                for sheet in midx.droplevel(level=1).drop_duplicates().values:
                    midf[sheet].to_excel(writer, sheet_name=sheet)
            # Retrieve and store site description metadata
            siteDescriptionDataDF = GetSiteData(location)
            siteDescriptionDataFileName = os.path.join(
                sitesdir, location, location + "-Site-Description.pkl"
            )
            _dump_pickle(siteDescriptionDataDF, siteDescriptionDataFileName)
            siteDescriptionDataDF.to_csv(siteDescriptionDataFileName[:-3] + "csv")

        # Process sites through PHREEQC
        if RUN_PHREEQC:
            print("Processing site water chemisty data in PHREEQC...")
            for location, midf in sitesDict.items():
                phreeqc_df = processMidf(
                    midf, os.path.join(sitesdir, location), PHREEQC_PATH, DATABASE_FILE
                )
                phreeqc_site_file = os.path.join(
                    sitesdir, location, location + "-PHREEQC.pkl"
                )
                try:
                    _dump_pickle(phreeqc_df, phreeqc_site_file)
                    phreeqc_df.to_csv(phreeqc_site_file[:-3] + "csv")
                except IOError:
                    print("Problem writing out PHREEQC data file.")
            # NOTE(review): bracketing is treated as part of the PHREEQC step
            # (only runs when RUN_PHREEQC is set) -- confirm against upstream.
            if bracket_charge_balance:
                for location, midf in sitesDict.items():
                    # Force balance on Calcium, then on Alkalinity
                    for element, label in (("Ca", "Ca"), ("Alk", "Alk")):
                        phreeqc_df_forced = processMidf(
                            midf,
                            os.path.join(sitesdir, location),
                            PHREEQC_PATH,
                            DATABASE_FILE,
                            force_balance=element,
                        )
                        phreeqc_site_file_forced = os.path.join(
                            sitesdir, location, location + "-PHREEQC-" + label + ".pkl"
                        )
                        try:
                            _dump_pickle(phreeqc_df_forced, phreeqc_site_file_forced)
                            phreeqc_df_forced.to_csv(
                                phreeqc_site_file_forced[:-3] + "csv"
                            )
                        except IOError:
                            print(
                                "Problem writing out PHREEQC " + label + " data file."
                            )

        # Create log file
        print(("Writing log file: " + LOG_FILE + splittag))
        try:
            _write_log(
                LOG_FILE + splittag,
                START_FILE,
                charDict,
                samples_processed,
                samples_not_processed,
            )
        except IOError:
            print(("Problem opening log file: " + LOG_FILE))
            return -1

    # exceptions for parsing of xml file
    except IOError:
        print("Error opening xml file. Does it exist?")
        # Note: can throw this error when discharge values are not read correctly
        return -1
    except etree.XMLSyntaxError as detail:
        print("File contains invalid XML syntax: ", detail)
        return -1
    except requests.exceptions.RequestException as detail:
        print("Error retrieving data by xml query: ", detail)
        return -1
    return 0
def runWQXtoPandas(startfilename, autosplitnum=20, restart=False):
    """
    Runs WQXtoPandas on an excel format input file where parameters can be set
    for an automatic query of data from the USGS NWIS database.

    Parameters
    ----------
    startfilename : string
        A string containing the name of the excel file to be used for input
        parameters to WQXtoPandas.
    autosplitnum : int (optional)
        The number of sites at which a NWIS query is split into multiple
        queries. Must be a positive integer; a string of digits (as received
        from the command line) is also accepted. (default=20)
    restart : bool
        Whether we are restarting a failed run and want to skip existing xmls.
        Default = False.

    Returns
    -------
    None on normal completion; -1 if autosplitnum is invalid, no xml file
    matches the input file pattern, or the site list file cannot be read.

    Notes
    -----
    Can be run from within a python shell or script, or as a standalone script
    from the command line where the start file name is provided as the first
    command line argument (e.g. WQXtoPandas <start file name> <autosplitnum>).

    In the start file, rows whose first cell is empty or begins with '#' are
    ignored. Rows before the one labelled 'Characteristic' are read as
    settings (name, value); rows after it describe one characteristic each.
    """
    # Accept "20" as well as 20 so command-line values work; reject bools,
    # floats and anything else, as before.
    if isinstance(autosplitnum, str):
        try:
            autosplitnum = int(autosplitnum)
        except ValueError:
            pass
    if isinstance(autosplitnum, bool) or not isinstance(autosplitnum, int):
        print("autosplitnum must be an integer.")
        return -1
    if autosplitnum < 1:
        # Zero would divide by zero when splitting; negatives make no sense.
        print("autosplitnum must be a positive integer.")
        return -1

    print(("Processing: " + startfilename))
    try:
        # open start file and its first sheet
        startfile = xlrd.open_workbook(startfilename)
        sheet = startfile.sheet_by_index(0)

        # parse start file to determine what should be done
        characteristicsBlockStarted = False
        settingsDict = {}
        charDict = {}
        column_headings = []
        for rownum in range(sheet.nrows):
            line = sheet.row_values(rownum)
            if not line:
                continue
            label = line[0]
            # ignore blank rows and comments (label may be a non-string cell)
            if label == "" or str(label).startswith("#"):
                continue
            if characteristicsBlockStarted:
                # we are in the characteristics block
                charDict[label] = dict(zip(column_headings, line[1:]))
            elif label == "Characteristic":
                # grab the characteristic block column headings
                column_headings = line[1:]
                characteristicsBlockStarted = True
            else:
                # read script settings
                settingsDict[label] = line[1]

        DATABASE_FILE = os.path.join(
            settingsDict["Path to chemical database"],
            settingsDict["Name of chemical database"],
        )
        LOG_FILE = os.path.join(
            settingsDict["Path to output directory"],
            settingsDict["Name of output directory"],
            settingsDict["Log file name"],
        )
        RUN_PHREEQC = settingsDict["Run PHREEQC?"] == "Yes"
        bracket_charge_balance = settingsDict["Force balance on Ca and Alk"] == "Yes"

        def run(location, fromFile, **extra):
            """Call WQXtoPandas with the settings shared by every input method."""
            return WQXtoPandas(
                location,
                charDict,
                outputPath=settingsDict["Path to output directory"],
                outputDirName=settingsDict["Name of output directory"],
                fromFile=fromFile,
                RUN_PHREEQC=RUN_PHREEQC,
                bracket_charge_balance=bracket_charge_balance,
                PHREEQC_PATH=settingsDict["Path to PHREEQC"],
                DATABASE_FILE=DATABASE_FILE,
                LOG_FILE=LOG_FILE,
                START_FILE=startfilename,
                **extra
            )

        # A numeric spreadsheet cell is read back as a float (1.0); map it to
        # the "1"/"2"/"3" strings used below.
        inputMethod = settingsDict["Input method"]
        if isinstance(inputMethod, float) and inputMethod.is_integer():
            inputMethod = str(int(inputMethod))

        if inputMethod == "1":
            # We already have an XML file to process that contains water
            # quality data. A wildcard may match more than one xml file.
            xml_file_string = os.path.join(
                settingsDict["Path to output directory"],
                settingsDict["Name of output directory"],
                settingsDict["Input file"],
            )
            xml_list = glob(xml_file_string)
            if not xml_list:
                print("Empty xml file list. Check path for xml file.")
                print("xml file string =", xml_file_string)
                return -1
            for xml_file in xml_list:
                run(xml_file, fromFile=True)

        elif inputMethod in ("2", "3"):
            # We will use a list of sites from a NWIS XML file (method 2) or
            # a text file (method 3) and query these sites for water quality
            # data. First extract the site list from the input file.
            if inputMethod == "2":
                extractSites, fileKind = extractSitesFromXML, "XML"
            else:
                extractSites, fileKind = extractSitesFromText, "text"
            try:
                siteList = extractSites(settingsDict["Input file"])
            except IOError:
                print(
                    "Problem extracting sites from "
                    + fileKind
                    + " file "
                    + settingsDict["Input file"]
                    + " check to see if file name is correct and file is in right location."
                )
                return -1
            # -1 is the failure value checked for the text extractor; the
            # same guard is harmless for the XML extractor.
            if isinstance(siteList, int) and siteList == -1:
                print("Problem obtaining site list.")
                return None

            # collect list of characteristics to query
            charList = [str(key) for key in charDict]
            if len(siteList) > autosplitnum:
                # We have too long of a list and should split into multiple
                # queries, each tagged with its group index.
                n_groups = int(ceil(len(siteList) / float(autosplitnum)))
                for i in range(n_groups):
                    shortList = siteList[i * autosplitnum : (i + 1) * autosplitnum]
                    queryText = querySiteList(shortList, charList)
                    if queryText is not None:
                        run(
                            queryText,
                            fromFile=False,
                            splittag="." + str(i),
                            restart=restart,
                        )
            else:
                # get html for query; honour restart here as well, so an
                # unsplit query is skipped (not prompted for) on restart.
                queryText = querySiteList(siteList, charList)
                if queryText is not None:
                    run(queryText, fromFile=False, restart=restart)
        else:
            print(
                (
                    'Problem with "Input Method" of start file: '
                    + str(settingsDict["Input method"])
                )
            )
    except IOError:
        print("Problem reading start file. Check file name.")
def main(argv=None):
    """Parse command-line arguments and run runWQXtoPandas.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse; defaults to the process command line.

    Returns
    -------
    Whatever runWQXtoPandas returns.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--restart", action="store_true")
    # type=int: runWQXtoPandas requires an integer, and argparse would
    # otherwise hand over the raw string.
    parser.add_argument("-a", "--autosplitnum", type=int)
    parser.add_argument("startfilename")
    args = parser.parse_args(argv)

    # Only override autosplitnum when it was given, so the default stays
    # defined in one place (the runWQXtoPandas signature).
    options = {"restart": args.restart}
    if args.autosplitnum is not None:
        options["autosplitnum"] = args.autosplitnum
    return runWQXtoPandas(args.startfilename, **options)


# Run as script
if __name__ == "__main__":
    main()