Source code for olm.USGS.loadWaterQualityData

"""
Functions to load water quality data that has been processed and pickled by WQXtoPHREEQC
"""
import os
import pickle as pickle
from pandas.io.pickle import read_pickle
from .siteListExtraction import extractSitesFromXML, extractSitesFromText
from glob import glob

# Default location of the processed-sites directory, relative to the current
# working directory. Used as the default value of the ``processedSitesDir`` /
# ``sitesDir`` parameters of the loader functions in this module.
DEFAULT_DIR = "./Processed-Sites"


def loadSiteListData(
    siteListText=None,
    siteFile=None,
    regEx="USGS-*",
    processedSitesDir=DEFAULT_DIR,
    loadPhreeqc=False,
    loadMetaData=False,
):
    """
    Retrieves site data for multiple sites within a processed sites directory.

    The site list is taken from the first of ``siteListText``, ``siteFile``
    and ``regEx`` that is not None. If all three are None the user is
    prompted interactively for the directory and the site list.

    Parameters
    ----------
    siteListText : string (optional)
        a list of sites separated by semi-colons
    siteFile : string (optional)
        a filename of a text (or XML) file with a list of sites
    regEx : string (optional)
        wildcard (glob) pattern used to search for site directories within
        the processed sites directory (default = 'USGS-*').
    processedSitesDir : string (optional)
        directory that contains all of the processed site directories. It is
        important to change this if the default is not correct.
        (default='./Processed-Sites')
    loadPhreeqc : boolean
        If set to true, PHREEQC outputs will also be loaded for each site.
        (default=False)
    loadMetaData : boolean
        If set to true, the site metadata will be loaded for each site.
        (default=False)

    Returns
    -------
    sitesDict : dict
        A dictionary of site DataFrames keyed by site name. Sites whose
        data could not be read are omitted.

    or if loadPhreeqc or loadMetaData are set to true

    (sitesDict, sitesPhreeqcDict, sitesMetaDataDict) : tuple
        A tuple containing the sitesDict and dicts of the PHREEQC data and/or
        metadata for each site. Only the requested dicts are included.
        Order is as shown.
    """
    if siteListText is not None:
        # check whether we have a valid site directory
        processedSitesDir = checkSitesDir(processedSitesDir)
        siteList = siteListFromLine(siteListText)
    elif siteFile is not None:
        processedSitesDir = checkSitesDir(processedSitesDir)
        siteList = siteListFromFile(siteFile)
    elif regEx is not None:
        processedSitesDir = checkSitesDir(processedSitesDir)
        siteList = siteListFromRegEx(regEx, processedSitesDir=processedSitesDir)
    else:
        # nothing was provided, so query the user for the needed data
        processedSitesDir, siteList = _promptForSiteList(processedSitesDir)

    sitesDict = {}
    sitesPhreeqcDict = {}
    sitesMetaDataDict = {}
    for site in siteList:
        siteFrame = loadSiteData(site, processedSitesDir=processedSitesDir)
        if siteFrame is None:
            # loadSiteData returns None when the site data cannot be read;
            # such sites are skipped entirely (no PHREEQC data or metadata).
            continue
        sitesDict[site] = siteFrame
        if loadPhreeqc:
            sitesPhreeqcDict[site] = loadSitePhreeqcData(
                site, processedSitesDir=processedSitesDir
            )
        if loadMetaData:
            sitesMetaDataDict[site] = loadSiteMetaData(
                site, processedSitesDir=processedSitesDir
            )

    if not (loadPhreeqc or loadMetaData):
        return sitesDict
    results = [sitesDict]
    if loadPhreeqc:
        results.append(sitesPhreeqcDict)
    if loadMetaData:
        results.append(sitesMetaDataDict)
    return tuple(results)


def _promptForSiteList(processedSitesDir):
    """Interactively ask for the sites directory and the site list.

    Returns a ``(processedSitesDir, siteList)`` tuple where the directory has
    been validated with ``checkSitesDir``.
    """
    processedSitesInput = input(
        "Path of the processed sites directory (Default = ./Processed-Sites): "
    )
    if processedSitesInput != "":
        processedSitesDir = processedSitesInput
    # echo the directory that is about to be validated
    print(processedSitesDir)
    processedSitesDir = checkSitesDir(processedSitesDir)

    mode = None
    while mode is None:
        answer = input(
            "Do you want to: \n \t 1) enter a semi-colon separated list of sites "
            "\n \t 2) provide a text file of sites \n \t \n\t 3) provide an XML list of sites "
            "\n \t 4) provide a wildcard expression to obtain sites from directory list"
            "\n Enter 1, 2, 3, or 4: "
        )
        try:
            choice = int(answer)
        except ValueError:
            choice = 0
        if 0 < choice < 5:
            mode = choice
        else:
            print("Invalid input")

    if mode == 1:
        siteListText = input("Enter list of sites separated by semi-colons: ")
        siteList = siteListFromLine(siteListText)
    elif mode == 2:
        siteFile = input("Enter path to text file containing site list: ")
        siteList = siteListFromFile(siteFile)
    elif mode == 3:
        siteFile = input("Enter path to XML file containing site list: ")
        siteList = siteListFromFile(siteFile, XML=True)
    else:
        regEx = input("Enter regular expression: ")
        # search the directory the user selected, not the module default
        siteList = siteListFromRegEx(regEx, processedSitesDir=processedSitesDir)
    return processedSitesDir, siteList
def loadSiteMetaData(site, processedSitesDir=DEFAULT_DIR):
    """
    Retrieves the pickled site description for an individual site from a
    directory of processed sites.

    Parameters
    ----------
    site : string
        name of the site to retrieve. It is used verbatim as both the site
        directory name and the file name prefix (no 'USGS-' tag is added).
    processedSitesDir : string (optional)
        directory that contains the processed site directory associated
        with the desired site. (default='./Processed-Sites')

    Returns
    -------
    siteMetaData : object or None
        The object unpickled from
        ``<processedSitesDir>/<site>/<site>-Site-Description.pkl``, or None
        if the file could not be read.
    """
    metaDataFile = os.path.join(
        processedSitesDir, site, site + "-Site-Description.pkl"
    )
    try:
        # NOTE: unpickling can execute arbitrary code; only point this at
        # processed-site directories from a trusted source.
        with open(metaDataFile, "rb") as metaDataHandle:
            siteMetaData = pickle.load(metaDataHandle)
    except IOError:
        print(("Problem reading pickle file: " + metaDataFile))
        return None
    return siteMetaData
def loadSiteData(site, processedSitesDir=DEFAULT_DIR):
    """
    Retrieves site data for an individual site from a directory of processed
    sites.

    Parameters
    ----------
    site : string
        name of the site to retrieve. The name is used exactly as given for
        both the site directory and the file name prefix; a 'USGS-' tag is
        not added automatically.
    processedSitesDir : string (optional)
        directory that contains the processed site directory associated
        with the desired site. It is important to change this if the default
        is not correct. (default='./Processed-Sites')

    Returns
    -------
    siteDataFrame : object or None
        Whatever ``read_pickle`` loads from
        ``<processedSitesDir>/<site>/<site>-Dataframe.pkl`` (expected to be a
        pandas DataFrame), or None if the file could not be read.
    """
    frameFile = os.path.join(processedSitesDir, site, site + "-Dataframe.pkl")
    try:
        return read_pickle(frameFile)
    except IOError:
        # report the unreadable file and fall through to the None result
        print(("Problem reading pickle file: " + frameFile))
    return None
def loadSitePhreeqcData(site, processedSitesDir=DEFAULT_DIR):
    """
    Retrieves site PHREEQC data for an individual site from a directory of
    processed sites.

    Parameters
    ----------
    site : string
        name of the site to retrieve. The name is used exactly as given for
        both the site directory and the file name prefix; a 'USGS-' tag is
        not added automatically.
    processedSitesDir : string (optional)
        directory that contains the processed site directory associated
        with the desired site. It is important to change this if the default
        is not correct. (default='./Processed-Sites')

    Returns
    -------
    sitedf : object or None
        Whatever ``read_pickle`` loads from
        ``<processedSitesDir>/<site>/<site>-PHREEQC.pkl`` (expected to be a
        pandas DataFrame), or None if the file could not be read.
    """
    phreeqcFile = os.path.join(processedSitesDir, site, site + "-PHREEQC.pkl")
    sitedf = None
    try:
        sitedf = read_pickle(phreeqcFile)
    except IOError:
        print(("Problem reading pickle file: " + phreeqcFile))
    return sitedf
def siteListFromLine(siteListText):
    """
    Splits a semi-colon separated string into a list of site names.

    Surrounding whitespace is stripped from every entry, and empty entries
    (e.g. from a trailing semi-colon or an empty string) are dropped so that
    callers never receive a blank site name.
    """
    stripped = (entry.strip() for entry in siteListText.split(";"))
    return [entry for entry in stripped if entry]


def siteListFromFile(siteFile, sitesDir=DEFAULT_DIR, XML=False):
    """
    Reads a list of site names from a file.

    The file is parsed with ``extractSitesFromXML`` when its name ends in
    '.xml' or when ``XML`` is true, and with ``extractSitesFromText``
    otherwise. ``sitesDir`` is currently unused and is kept only for
    backward compatibility with existing callers.
    """
    if siteFile.endswith(".xml") or XML:
        return extractSitesFromXML(siteFile)
    return extractSitesFromText(siteFile)


def siteListFromRegEx(regEx, processedSitesDir=DEFAULT_DIR):
    """
    Lists the site directories inside ``processedSitesDir`` that match a
    pattern.

    Despite the parameter name, ``regEx`` is a wildcard pattern that is
    expanded with ``glob`` (e.g. 'USGS-*'), not a regular expression. Only
    matches that are directories are returned, as bare directory names.
    """
    matches = glob(os.path.join(processedSitesDir, regEx))
    return [os.path.split(match)[1] for match in matches if os.path.isdir(match)]


def checkSitesDir(processedSitesDir):
    """
    Makes sure the processed sites path exists, prompting until it does.

    While the path does not exist the user is asked to type a new one. Only
    existence is checked, not whether the path is a directory. Returns the
    first path that exists.
    """
    while not os.path.exists(processedSitesDir):
        print("Invalid path to processed sites directory.")
        processedSitesDir = input(
            "Path of the processed sites directory (Default = ./Processed-Sites): "
        )
    return processedSitesDir