#! /usr/bin/env python
"""
This script takes an input file in USGS/EPA WQX xml format and creates a multi-indexed
Pandas Dataframe that contains time series of water quality data and discharge combined
with layers that contain meta data for each data value. You can call the script from
the command line using WQXtoPandas [input WQX start file], or import the runWQXtoPandas
function for calling within a Python session.
"""
from __future__ import print_function
import sys
import xlrd
import os
import time
import argparse
import requests
from glob import glob
from math import ceil
import pickle as pickle
from lxml import etree
from pandas import DataFrame, to_datetime, concat, ExcelWriter
from olm.USGS.PhreeqcPandas import processMidf
# import functions from olm package
from olm.USGS.siteListExtraction import extractSitesFromXML
from olm.USGS.siteListExtraction import extractSitesFromText
from olm.USGS.DataRetrieval import querySiteList, GetDailyDischarge, GetSiteData
from olm.USGS.dataSlice import extractValues
def WQXtoPandas(
    xmlLocation,
    charDict,
    outputPath=".",
    fromFile=False,
    outputDirName="Processed-Sites",
    RUN_PHREEQC=False,
    PHREEQC_PATH="/home/mcoving/phreeqc-2.18.0/bin/",
    DATABASE_FILE="/home/mcoving/phreeqc-2.18.0/database/phreeqc.dat",
    LOG_FILE="Result.log",
    START_FILE=None,
    splittag="",
    bracket_charge_balance=False,
    max_xml_query_tries=20,
    restart=False,
):
    """
    Processes a WQX xml data file and loads data for each site in the WQX file into
    Pandas data objects that are stored in directories for each site.
    Parameters
    ----------
    xmlLocation : string
        Content depends on mode in which WQXtoPandas is run. When fromFile is set to
        False (input methods 2 or 3 in excel file) this string contains the html for
        a query to the USGS NWIS database to obtain an xml file of the desired data.
        Alternatively, if fromFile is True (input method 1 in excel file) then this
        string contains the name of the xml file from which to read the data.
    charDict : dict
        A dictionary containing information about the characteristics to be processed.
        Keys are EPA SRS characteristic names. Each entry in the dictionary is a second
        dictionary that contains keys IsRequired, pcode, fraction, and quality. These
        entries tell WQXtoPandas whether a given characteristic is required in order to
        process a sample, and whether a specific pcode, fraction, or quality should be
        required. See excel example file for more details.
    outputPath : string
        path to directory that will contain output directory
    fromFile : boolean
        True if data will be read from an xml file already present on computer. False
        if xml file should be queried from NWIS. (Default=False)
    outputDirName : string
        Name of output directory where all site data will be written out.
        (Default='Processed-Sites')
    RUN_PHREEQC : boolean
        Set to true if samples should be processed through PHREEQC. (Default=False)
    PHREEQC_PATH : string
        Path to PHREEQC executable (folder only, not executable file name)
    DATABASE_FILE : string
        Path to database file that PHREEQC should use, including database file name.
    LOG_FILE : string
        Name of log file that WQXtoPandas will create. (Default='Result.log')
    START_FILE : string
        Name of xls start file that was used to run this instance of WQXtoPandas. Name
        will be written out in log file.
    bracket_charge_balance : bool
        If set to true, WQXtoPandas will alternately force charge balance on calcium and
        alkalinity, while the latter is not physically meaningful, this provides a useful
        estimate of uncertainty for cases with high charge balance errors. This is most
        useful for water that is very dilute or with high organic content, such that
        titrated alkalinity values are artificially high.
    max_xml_query_tries : int
        Maximum number of times to try to retreive an xml file using a query to the Water
        Quality Portal database. Default = 20.
    restart : bool
        Boolean to enable restarting failed run. If set to True, then the function will
        skip over any queries that already have an xml file created. Default = False.
    Returns
    -------
    Returns 0 if execution successful. Returns -1 in case of error.
    Notes
    -----
    Designed to be run through convenience function runWQXtoPandas().
    """
    try:
        # Check to see if output directory exists
        absOutputDirPath = os.path.abspath(outputPath)
        sitesdir = os.path.join(absOutputDirPath, outputDirName)
        print("sitesdir", sitesdir)
        if not (os.path.exists(sitesdir)):
            try:
                os.makedirs(sitesdir)
            except os.error:
                print(
                    (
                        "Problem creating output directory. Check output path name: "
                        + outputPath
                    )
                )
                return -1
        # create xml tree
        if fromFile:
            # read from file
            print("xmlLocation", xmlLocation)
            wqxtree = etree.ElementTree(file=xmlLocation)
        else:
            # check whether we already have a matching xml file
            # Cached query results are stored as <LOG_FILE><splittag>.xml
            xmlSaveFile = LOG_FILE + splittag + ".xml"
            if os.path.isfile(xmlSaveFile):
                if restart:
                    print(
                        "Skipping "
                        + xmlSaveFile
                        + " because it exists and we are running in restart mode."
                    )
                    # If we are restarting a failed run. Skip existing xml files.
                    return -1
                # Interactive prompt: reuse the cached xml or re-query NWIS
                goodAnswer = False
                while not (goodAnswer):
                    answer = input(
                        "An xml file ("
                        + xmlSaveFile
                        + ") already exists. \n Use this instead of html query (y or n)?"
                    )
                    if answer.startswith("y"):
                        # read from file
                        wqxtree = etree.ElementTree(file=xmlSaveFile)
                        goodAnswer = True
                        queryXML = False
                    elif answer.startswith("n"):
                        goodAnswer = True
                        queryXML = True
            else:
                queryXML = True
            # If we don't have a matching xml file, or we want to obtain a new one, then get the new xml
            if queryXML:
                gotXML = False
                ntries = 0
                print("Obtaining xml file from USGS NWIS using html query...")
                # parse from html query
                # Retry loop: pause one minute between failed attempts, up to
                # max_xml_query_tries total tries.
                while not gotXML and ntries <= max_xml_query_tries:
                    if ntries > 0:
                        print("Trying again: try number " + str(ntries))
                    print("XML query string: ", xmlLocation)
                    r = requests.get(xmlLocation)
                    if r.ok:
                        gotXML = True
                    else:
                        # There is some problem with the xml query
                        print("Response: ", str(r))
                        print("Reason: ", r.reason)
                        if "warning" in r.headers:
                            print("Warning: ", r.headers["warning"])
                        ntries += 1
                        if ntries > max_xml_query_tries:
                            print(
                                "Reached maximum number of tries. Stopping this query."
                            )
                            return -1
                        print("Pausing for one minute and will retry...")
                        time.sleep(60)
                try:
                    # write xml to file so later runs can reuse it without re-querying
                    xmlFile = open(xmlSaveFile, "w")
                    print(r.text, file=xmlFile)
                    xmlFile.close()
                    wqxtree = etree.ElementTree(file=xmlSaveFile)
                except IOError:
                    print(
                        (
                            "Problem writing to xml file to store html query: "
                            + xmlSaveFile
                        )
                    )
                    return -1
        # begin parsing XML tree
        root = wqxtree.getroot()
        # get namespace map
        # lxml keys the default namespace under None; build a '{uri}' Clark-notation
        # prefix so tag lookups below are namespace-qualified.
        NSMAP = root.nsmap
        WQX = "{%s}" % NSMAP[None]
        # iterate over all <Activity> tags within file and process each sample
        # NOTE(review): getiterator() is deprecated in lxml; iter() is the modern
        # equivalent — left unchanged here.
        samples_processed = []
        samples_not_processed = []
        sitesDict = {}
        for activity in wqxtree.getiterator(tag=WQX + "Activity"):
            processThisSample = True
            reason = ""
            description = activity.find(WQX + "ActivityDescription")
            if description != None:
                datetext = description.findtext(WQX + "ActivityStartDate")
                starttime = description.find(WQX + "ActivityStartTime")
                if starttime != None:
                    timetext = starttime.findtext(WQX + "Time")
                    timezone = starttime.findtext(WQX + "TimeZoneCode")
                else:
                    timetext = ""
                    timezone = ""
                location = description.findtext(WQX + "MonitoringLocationIdentifier")
                # USGS site IDs carry a 'USGS-' prefix; this flag controls pcode
                # filtering and the daily discharge query below.
                if location[:5] == "USGS-":
                    USGS = True
                else:
                    USGS = False
                descriptionDict = {
                    "location": location,
                    "date": datetext,
                    "time": timetext,
                    "timezone": timezone,
                }
            else:
                descriptionDict = None
                processThisSample = False
                reason = "No description"
            # NOTE(review): if description was None on the very first activity,
            # 'location' and 'datetext' are undefined here (NameError); on later
            # activities they hold stale values from the previous iteration.
            print(("Processing sample from " + location + " on " + datetext))
            # create null sample dict
            sampleDict = {}
            sampleMetaDict = {}
            # iterate though all results for this activity
            for result in activity.getiterator(tag=WQX + "Result"):
                if processThisSample:
                    try:
                        resultdesc = result.find(WQX + "ResultDescription")
                        characteristic = resultdesc.findtext(WQX + "CharacteristicName")
                        if characteristic in charDict:
                            samplefraction = resultdesc.findtext(
                                WQX + "ResultSampleFractionText"
                            )
                            pcode = resultdesc.findtext(WQX + "USGSPCode")
                            quality = resultdesc.findtext(
                                WQX + "ResultStatusIdentifier"
                            )
                            measure = resultdesc.find(WQX + "ResultMeasure")
                            count = 1.0
                            detection = resultdesc.findtext(
                                WQX + "ResultDetectionConditionText"
                            )
                            # print('detection=',detection)
                            # A result is usable when it has either a measured value
                            # or a detection-condition entry (a non-detect).
                            if not (measure is None) or not (detection is None):
                                if not (measure is None):
                                    value = measure.findtext(WQX + "ResultMeasureValue")
                                    # print('initial value = ',value)
                                    units = measure.findtext(WQX + "MeasureUnitCode")
                                    # EPA system does not have detection info.
                                    # Check for < in value text.
                                    if "<" in str(value):
                                        value = value[1:]
                                        nondetect = True
                                    else:
                                        nondetect = False
                                elif not (detection is None):
                                    # print("entering nondetect...")
                                    # Non-detect: use the lab's quantitation limit
                                    # (if present) as the reported value.
                                    nondetect = True
                                    value = None
                                    labinfo = result.find(WQX + "ResultLabInformation")
                                    if not (labinfo == None):
                                        # print("labinfo present")
                                        quantLimitMeasure = labinfo.find(
                                            WQX + "ResultDetectionQuantitationLimit"
                                        )
                                        if not (quantLimitMeasure == None):
                                            # print("Quant limit present")
                                            nondetectmeasure = quantLimitMeasure.find(
                                                WQX
                                                + "DetectionQuantitationLimitMeasure"
                                            )
                                            if not (nondetectmeasure == None):
                                                # print("Quant limit measure present")
                                                value = nondetectmeasure.findtext(
                                                    WQX + "MeasureValue"
                                                )
                                # print('measurevalue=',value)
                                # print(quantLimitMeasure)
                                # print("nondetect value=",value)
                                # split pcode into list
                                # pcodeDict maps pcode -> priority (list position);
                                # earlier entries in the ';'-separated list win.
                                tempPcodeList = charDict[characteristic]["pcode"].split(
                                    ";"
                                )
                                # print("tempPcodeList="+str(tempPcodeList))
                                pcodeDict = {}
                                for codePriority, code in enumerate(tempPcodeList):
                                    code = code.strip()
                                    if code != "":
                                        pcodeDict[code] = codePriority
                                # Check whether characteristic meets criteria
                                # for inclusion, otherwise don't add to sampleDict
                                # ('0' in the start file means "no filter")
                                addCharacteristic = True
                                if charDict[characteristic]["fraction"] != "0":
                                    # test for correct fraction
                                    if (
                                        charDict[characteristic]["fraction"]
                                        != samplefraction
                                    ):
                                        addCharacteristic = False
                                if addCharacteristic:
                                    if USGS:
                                        if charDict[characteristic]["pcode"] != "0":
                                            # test for correct pcode
                                            # print("pcode = "+pcode)
                                            # print("pcodeList = "+str(pcodeList))
                                            # print("pcode in list="+str(pcode in pcodeList))
                                            if not (pcode in pcodeDict):
                                                addCharacteristic = False
                                if addCharacteristic:
                                    if charDict[characteristic]["quality"] != "0":
                                        # test for correct data quality
                                        if (
                                            charDict[characteristic]["quality"]
                                            != quality
                                        ):
                                            addCharacteristic = False
                                # end of characteristic criteria check
                                # Process duplicate characteristics
                                # (same characteristic seen twice in one sample:
                                # keep higher-priority pcode, or average equals)
                                if addCharacteristic:
                                    if characteristic in sampleDict:
                                        if USGS:
                                            priorPcode = sampleMetaDict[characteristic][
                                                "pcode"
                                            ]
                                            # if there are already multiple pcodes get only first one
                                            priorPcode = priorPcode.split(";")[0]
                                            averageValue = False
                                            if len(pcodeDict) > 1:
                                                thisPcodePriority = pcodeDict[pcode]
                                                priorPcodePriority = pcodeDict[
                                                    priorPcode
                                                ]
                                                if (
                                                    thisPcodePriority
                                                    > priorPcodePriority
                                                ):
                                                    # previous characteristic remains
                                                    addCharacteristic = False
                                                elif (
                                                    thisPcodePriority
                                                    == priorPcodePriority
                                                ):
                                                    averageValue = True
                                            else:
                                                averageValue = True
                                            if averageValue:
                                                priorUnits = sampleMetaDict[
                                                    characteristic
                                                ]["units"]
                                                # Only average if we have the same units
                                                if units == priorUnits:
                                                    # Check if this or prior was non-detect
                                                    if sampleMetaDict[characteristic][
                                                        "nondetect"
                                                    ]:
                                                        if nondetect:
                                                            # If both are non-detect, no need to add
                                                            averageValue = False
                                                            addCharacteristic = False
                                                        else:
                                                            # If prior was non-detect, but this one isn't
                                                            # Add this one instead
                                                            averageValue = False
                                                            addCharacteristic = True
                                                    elif nondetect:
                                                        # This one is non-detect, prior was not.
                                                        averageValue = False
                                                        addCharacteristic = False
                                                    if averageValue:
                                                        # average this value with existing values
                                                        # 'count' is a running tally so the
                                                        # mean stays correct over 3+ values
                                                        count = sampleMetaDict[
                                                            characteristic
                                                        ]["count"]
                                                        count += 1.0
                                                        oldvalue = float(
                                                            sampleDict[characteristic]
                                                        )
                                                        newvalue = (
                                                            oldvalue * (count - 1.0)
                                                            + float(value)
                                                        ) / count
                                                        value = str(newvalue)
                                                        pcode = (
                                                            priorPcode + "; " + pcode
                                                        )
                                                        # Changed this behavior to not allow different units
                                                        # units = priorUnits + '; ' + units
                                                else:
                                                    # Do not add if units are different
                                                    addCharacteristic = False
                                if addCharacteristic:
                                    sampleDict[characteristic] = value
                                    sampleMetaDict[characteristic] = {
                                        "samplefraction": samplefraction,
                                        "units": units,
                                        "pcode": pcode,
                                        "quality": quality,
                                        "count": count,
                                        "nondetect": nondetect,
                                    }
                    except etree.XMLSyntaxError as detail:
                        print("File contains invalid XML syntax: ", detail)
                        processThisSample = False
                        reason = "Entry contains invalid XML syntax."
            # end results loop
            # check whether sample has all the required constituents
            if processThisSample:
                for characteristic in charDict.keys():
                    if charDict[characteristic]["IsRequired"] != "0":
                        if not (characteristic in sampleDict):
                            processThisSample = False
                            reason += characteristic + " not available. "
            if processThisSample:
                # check to see whether site directory exists, if not, create it
                sampledir = os.path.join(sitesdir, location)
                if not (os.path.exists(sampledir)):
                    try:
                        os.makedirs(sampledir)
                    except os.error:
                        print(("Problem creating location directory: " + sampledir))
                        processThisSample = False
                        reason = "Problem creating location directory: " + sampledir
            if processThisSample:
                # Pull daily discharge data from USGS website
                good_discharge_value = False
                num_Q_tries = 0
                if not USGS:
                    # We do not have a USGS site, do not query discharge
                    # (num_Q_tries = 99 skips the retry loop below)
                    num_Q_tries = 99
                    dischargeDict = None
                # Try 5 times to retrieve discharge value
                while (not good_discharge_value) and num_Q_tries <= 5:
                    dischargeDict = GetDailyDischarge(
                        location, datetext
                    )  # currently hard-wired to pcode 00060 (daily discharge, cfs)
                    if dischargeDict != -1:
                        good_discharge_value = True
                    else:
                        num_Q_tries += 1
                        dischargeDict = None
                if dischargeDict is not None:
                    sampleDict["Stream flow, mean. daily"] = dischargeDict["discharge"]
                    sampleMetaDict["Stream flow, mean. daily"] = {
                        "units": "cfs",
                        "pcode": "00060",
                        "quality": dischargeDict["quality"],
                        "count": 1,
                        "samplefraction": None,
                        "nondetect": False,
                    }
                    descriptionDict["name"] = dischargeDict["name"]
                else:
                    # Possibly allow this sample to be thrown out if no mean daily discharge, and/or similar for instantaneous discharge
                    sampleDict["Stream flow, mean. daily"] = None
                    sampleMetaDict["Stream flow, mean. daily"] = {
                        "units": "cfs",
                        "pcode": "00060",
                        "quality": None,
                        "count": 1,
                        "samplefraction": None,
                        "nondetect": False,
                    }
                # Create data frame row for this sample date
                if descriptionDict["time"] != "":
                    rowdate = to_datetime(datetext + " " + descriptionDict["time"])
                else:
                    rowdate = to_datetime(datetext)
                # Create Multiindex Dataframe to contain sample meta data
                # Top level of the column MultiIndex is the layer name
                # ('data', 'time', ..., 'nondetect'); second level is characteristic.
                sampleMultiindexRow = concat(
                    {
                        "data": DataFrame(sampleDict, index=[rowdate], dtype="float"),
                        "time": DataFrame(
                            descriptionDict["time"],
                            index=[rowdate],
                            columns=list(sampleMetaDict.keys()),
                        ),
                        "timezone": DataFrame(
                            descriptionDict["timezone"],
                            index=[rowdate],
                            columns=list(sampleMetaDict.keys()),
                        ),
                        "pcode": DataFrame(
                            [extractValues(sampleMetaDict, ["pcode"])["values"]],
                            index=[rowdate],
                            columns=list(sampleMetaDict.keys()),
                        ),
                        "quality": DataFrame(
                            [extractValues(sampleMetaDict, ["quality"])["values"]],
                            index=[rowdate],
                            columns=list(sampleMetaDict.keys()),
                        ),
                        "fraction": DataFrame(
                            [
                                extractValues(sampleMetaDict, ["samplefraction"])[
                                    "values"
                                ]
                            ],
                            index=[rowdate],
                            columns=list(sampleMetaDict.keys()),
                        ),
                        "units": DataFrame(
                            [extractValues(sampleMetaDict, ["units"])["values"]],
                            index=[rowdate],
                            columns=list(sampleMetaDict.keys()),
                        ),
                        "count": DataFrame(
                            [extractValues(sampleMetaDict, ["count"])["values"]],
                            index=[rowdate],
                            columns=list(sampleMetaDict.keys()),
                        ),
                        "nondetect": DataFrame(
                            [extractValues(sampleMetaDict, ["nondetect"])["values"]],
                            index=[rowdate],
                            columns=list(sampleMetaDict.keys()),
                        ),
                    },
                    axis=1,
                )
                # sampleMetaRow = Series(sampleMetaDict, index=[to_datetime(datetext)], dtype='object')
                # Previous solution was reading/writing from pickle files
                # New solution will keep all data in memory until end.
                # This could cause memory problems with large data sets
                # Test whether a df for this location already exists
                if location in sitesDict:
                    # tempDF = sitesDict[location]
                    # sitesDict[location] = tempDF.append(sampleRow)
                    tempMultiindex = sitesDict[location]
                    sitesDict[location] = concat(
                        [tempMultiindex, sampleMultiindexRow], axis=0
                    )
                else:
                    sitesDict[location] = sampleMultiindexRow
            # add one to number of samples processed
            if processThisSample:
                samples_processed.append(location + " " + datetext)
            else:
                samples_not_processed.append(location + " " + datetext + " - " + reason)
        print(("Number of Samples Processed = " + str(len(samples_processed))))
        print(("Number of Samples Not Processed = " + str(len(samples_not_processed))))
        # Write out individual site data pickle and csv files in each site directory
        print("Writing out site data files...")
        for location, midf in sitesDict.items():
            print(location)
            pickleFile = os.path.join(sitesdir, location, location + "-Dataframe.pkl")
            pickle.dump(midf, open(pickleFile, "wb"))
            midx = midf.keys()
            # One xlsx sheet per top-level MultiIndex layer ('data', 'units', ...)
            with ExcelWriter(pickleFile[:-3] + "xlsx") as writer:
                for sheet in midx.droplevel(level=1).drop_duplicates().values:
                    midf[sheet].to_excel(writer, sheet_name=sheet)
            # Retrieve and store site description metadata
            siteDescriptionDataDF = GetSiteData(location)
            siteDescriptionDataFileName = os.path.join(
                sitesdir, location, location + "-Site-Description.pkl"
            )
            pickle.dump(siteDescriptionDataDF, open(siteDescriptionDataFileName, "wb"))
            siteDescriptionDataDF.to_csv(siteDescriptionDataFileName[:-3] + "csv")
        # Process sites through PHREEQC
        if RUN_PHREEQC:
            print("Processing site water chemisty data in PHREEQC...")
            for location, midf in sitesDict.items():
                phreeqc_df = processMidf(
                    midf, os.path.join(sitesdir, location), PHREEQC_PATH, DATABASE_FILE
                )
                phreeqc_site_file = os.path.join(
                    sitesdir, location, location + "-PHREEQC.pkl"
                )
                try:
                    pickle.dump(phreeqc_df, open(phreeqc_site_file, "wb"))
                    phreeqc_df.to_csv(phreeqc_site_file[:-3] + "csv")
                except IOError:
                    print("Problem writing out PHREEQC data file.")
            # Bracket charge-balance uncertainty by forcing balance on Ca,
            # then on alkalinity (see docstring).
            if bracket_charge_balance:
                for location, midf in sitesDict.items():
                    # Force balance on Calcium
                    phreeqc_df_ca = processMidf(
                        midf,
                        os.path.join(sitesdir, location),
                        PHREEQC_PATH,
                        DATABASE_FILE,
                        force_balance="Ca",
                    )
                    phreeqc_site_file_ca = os.path.join(
                        sitesdir, location, location + "-PHREEQC-Ca.pkl"
                    )
                    try:
                        pickle.dump(phreeqc_df_ca, open(phreeqc_site_file_ca, "wb"))
                        phreeqc_df_ca.to_csv(phreeqc_site_file_ca[:-3] + "csv")
                    except IOError:
                        print("Problem writing out PHREEQC Ca data file.")
                    # Force balance on Alkalinity
                    phreeqc_df_alk = processMidf(
                        midf,
                        os.path.join(sitesdir, location),
                        PHREEQC_PATH,
                        DATABASE_FILE,
                        force_balance="Alk",
                    )
                    phreeqc_site_file_alk = os.path.join(
                        sitesdir, location, location + "-PHREEQC-Alk.pkl"
                    )
                    try:
                        pickle.dump(phreeqc_df_alk, open(phreeqc_site_file_alk, "wb"))
                        phreeqc_df_alk.to_csv(phreeqc_site_file_alk[:-3] + "csv")
                    except IOError:
                        print("Problem writing out PHREEQC Alk data file.")
        # Create log file
        print(("Writing log file: " + LOG_FILE + splittag))
        try:
            log_file = open(LOG_FILE + splittag, "w")
            print("Start file = " + START_FILE, file=log_file)
            print(
                "Number of Samples Processed = " + str(len(samples_processed)),
                file=log_file,
            )
            print(
                "Number of Samples Not Processed = " + str(len(samples_not_processed)),
                file=log_file,
            )
            print("###############", file=log_file)
            print("Characteristics", file=log_file)
            print("###############", file=log_file)
            printColumnNames = True
            for key, flags in charDict.items():
                if printColumnNames:
                    names = ["characteristic"]  # + '\t'
                    for column in flags.keys():
                        names.append(str(column))
                    print(str("\t".join(names)), file=log_file)
                    printColumnNames = False
                columns = [key]
                for column in flags.keys():
                    if isinstance(flags[column], str):
                        columns.append(flags[column])
                print(str("\t".join(columns)), file=log_file)
            print("###############", file=log_file)
            print("Samples processed", file=log_file)
            print("###############", file=log_file)
            for line in samples_processed:
                print(line, file=log_file)
            print("###############", file=log_file)
            print("Samples not processed", file=log_file)
            print("###############", file=log_file)
            for line in samples_not_processed:
                print(line, file=log_file)
        except IOError:
            print(("Problem opening log file: " + LOG_FILE))
            return -1
    # exceptions for parsing of xml file
    except IOError:
        print("Error opening xml file. Does it exist?")
    # Note: can throw this error when discharge values are not read correctly,
    # I should fix this, 6/16/2014
    except etree.XMLSyntaxError as detail:
        print("File contains invalid XML syntax: ", detail)
    except requests.exceptions.RequestException as detail:
        print("Error retrieving data by xml query: ", detail)
    return 0
def _querySitesAndProcess(
    siteList,
    charDict,
    settingsDict,
    autosplitnum,
    RUN_PHREEQC,
    bracket_charge_balance,
    DATABASE_FILE,
    LOG_FILE,
    startfilename,
    restart,
):
    """
    Query NWIS for water quality data for a list of sites and run WQXtoPandas
    on each resulting query.

    Site lists longer than autosplitnum are split into batches of autosplitnum
    sites, each tagged with a distinct splittag so their cached xml/log files
    do not collide. The restart flag is forwarded to every WQXtoPandas call.
    """
    # collect list of characteristics to query
    charList = [str(key) for key in charDict.keys()]
    # Keyword arguments shared by every WQXtoPandas call below.
    common_kwargs = dict(
        outputPath=settingsDict["Path to output directory"],
        outputDirName=settingsDict["Name of output directory"],
        fromFile=False,
        RUN_PHREEQC=RUN_PHREEQC,
        bracket_charge_balance=bracket_charge_balance,
        PHREEQC_PATH=settingsDict["Path to PHREEQC"],
        DATABASE_FILE=DATABASE_FILE,
        LOG_FILE=LOG_FILE,
        START_FILE=startfilename,
        restart=restart,
    )
    if len(siteList) > autosplitnum:
        # Too long a list for a single query; split into batches.
        # ceil() correctly covers lists that do not divide evenly.
        n_groups = int(ceil(len(siteList) / float(autosplitnum)))
        for i in range(n_groups):
            shortList = siteList[i * autosplitnum : (i + 1) * autosplitnum]
            queryText = querySiteList(shortList, charList)
            if queryText is not None:
                WQXtoPandas(
                    queryText, charDict, splittag="." + str(i), **common_kwargs
                )
    else:
        # get html for query
        queryText = querySiteList(siteList, charList)
        if queryText is not None:
            WQXtoPandas(queryText, charDict, **common_kwargs)


def runWQXtoPandas(startfilename, autosplitnum=20, restart=False):
    """
    Runs WQXtoPandas on an excel format input file where parameters can be set for an automatic query of data from
    the USGS NWIS database.
    Parameters
    ----------
    startfilename : string
        A string containing the name of the excel file to be used for input parameters to WQXtoPandas
    autosplitnum : int (optional)
        The number of sites at which a NWIS query is split into multiple queries. (default=20)
    restart : bool
        Whether we are restarting a failed run and want to skip existing xmls. Default = False.
    Returns
    -------
    None on success; -1 if the start file or its settings are invalid.
    Notes
    -----
    Can be run from within a python shell or script, or as a standalone script from the command line where the start
    file name is provided as the first command line argument (e.g. WQXtoPandas <start file name> <autosplitnum>).
    """
    if not isinstance(autosplitnum, int):
        print("autosplitnum must be an integer.")
        return -1
    print(("Processing: " + startfilename))
    try:
        # open start file
        startfile = xlrd.open_workbook(startfilename)
        # open sheet
        sheet = startfile.sheet_by_index(0)
        # parse start file to determine what should be done
        characteristicsBlockStarted = False
        settingsDict = {}
        charDict = {}
        for rownum in range(sheet.nrows):
            line = sheet.row_values(rownum)
            # Coerce the first cell to str so blank or numeric cells do not
            # crash the comment test (previously line[0][0] raised on "").
            if not str(line[0]).startswith("#"):  # ignore comments
                if not (characteristicsBlockStarted):  # read script settings
                    if not (line[0] == "Characteristic"):
                        settingsDict[line[0]] = line[1]
                    else:  # grab the characteristic block column headings
                        column_headings = line[1:]
                        characteristicsBlockStarted = True
                else:  # we are in the characteristics block
                    charDict[line[0]] = dict(list(zip(column_headings, line[1:])))
        DATABASE_FILE = os.path.join(
            settingsDict["Path to chemical database"],
            settingsDict["Name of chemical database"],
        )
        LOG_FILE = os.path.join(
            settingsDict["Path to output directory"],
            settingsDict["Name of output directory"],
            settingsDict["Log file name"],
        )
        RUN_PHREEQC = settingsDict["Run PHREEQC?"] == "Yes"
        bracket_charge_balance = settingsDict["Force balance on Ca and Alk"] == "Yes"
        if settingsDict["Input method"] == "1":
            # We already have an XML file to process that contains water quality data
            # Check whether a wildcard was used and more than one xml file is available
            xml_file_string = os.path.join(
                settingsDict["Path to output directory"],
                settingsDict["Name of output directory"],
                settingsDict["Input file"],
            )
            xml_list = glob(xml_file_string)
            if xml_list == []:
                print("Empty xml file list. Check path for xml file.")
                print("xml file string =", xml_file_string)
                return -1
            # One call per matched file (a single match is just a one-file loop).
            for xml_file in xml_list:
                WQXtoPandas(
                    xml_file,
                    charDict,
                    outputPath=settingsDict["Path to output directory"],
                    outputDirName=settingsDict["Name of output directory"],
                    fromFile=True,
                    RUN_PHREEQC=RUN_PHREEQC,
                    bracket_charge_balance=bracket_charge_balance,
                    PHREEQC_PATH=settingsDict["Path to PHREEQC"],
                    DATABASE_FILE=DATABASE_FILE,
                    LOG_FILE=LOG_FILE,
                    START_FILE=startfilename,
                )
        elif settingsDict["Input method"] == "2":
            # We will use a list of sites from a NWIS XML file and query these
            # sites for water quality data
            # First extract site list from XML file
            try:
                siteList = extractSitesFromXML(settingsDict["Input file"])
            except IOError:
                print(
                    "Problem extracting sites from XML file "
                    + settingsDict["Input file"]
                    + " check to see if file name is correct and file is in right location."
                )
                return -1
            _querySitesAndProcess(
                siteList,
                charDict,
                settingsDict,
                autosplitnum,
                RUN_PHREEQC,
                bracket_charge_balance,
                DATABASE_FILE,
                LOG_FILE,
                startfilename,
                restart,
            )
        elif settingsDict["Input method"] == "3":
            # We will use a list of sites from a text file and query these
            # sites for water quality data
            # First extract site list from text file
            try:
                siteList = extractSitesFromText(settingsDict["Input file"])
            except IOError:
                print(
                    "Problem extracting sites from text file "
                    + settingsDict["Input file"]
                    + " check to see if file name is correct and file is in right location."
                )
                return -1
            # extractSitesFromText signals failure by returning -1
            if siteList != -1:
                _querySitesAndProcess(
                    siteList,
                    charDict,
                    settingsDict,
                    autosplitnum,
                    RUN_PHREEQC,
                    bracket_charge_balance,
                    DATABASE_FILE,
                    LOG_FILE,
                    startfilename,
                    restart,
                )
            else:
                print("Problem obtaining site list.")
        else:
            print(
                (
                    'Problem with "Input Method" of start file: '
                    + settingsDict["Input method"]
                )
            )
    except IOError:
        print("Problem reading start file. Check file name.")
# Run as script
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--restart",
        action="store_true",
        help="skip queries whose xml files already exist (restart a failed run)",
    )
    # type=int is required: runWQXtoPandas rejects a non-integer autosplitnum,
    # and argparse would otherwise pass the flag's value through as a string.
    parser.add_argument(
        "-a",
        "--autosplitnum",
        type=int,
        help="number of sites per NWIS query before splitting into batches",
    )
    parser.add_argument("startfilename", help="excel start file to process")
    args = parser.parse_args()
    startfilename = args.startfilename
    if args.autosplitnum is not None:
        runWQXtoPandas(
            startfilename, autosplitnum=args.autosplitnum, restart=args.restart
        )
    else:
        runWQXtoPandas(startfilename, restart=args.restart)