Source code for olm.USGS.loadWaterQualityData

"""
Functions to load water quality data that has been processed and pickled by WQXtoPHREEQC
"""
import os
import cPickle as pickle
from pandas.io.pickle import read_pickle
from siteListExtraction import *
from glob import glob

# Default path of the processed sites directory; used as the default value of
# the processedSitesDir arguments of the loader functions in this module.
DEFAULT_DIR = './Processed-Sites'

# siteListText - list of sites separated by semi-colons
# siteFile - text file with list of sites
# regEx - a wildcard expression to use in directory
def loadSiteListData(siteListText = None,
                     siteFile = None,
                     regEx = 'USGS-*',
                     processedSitesDir = DEFAULT_DIR,
                     loadPhreeqc = False,
                     loadMetaData = False
                     ):
    """
    Retrieves site data for multiple sites within a processed sites directory.

    The list of sites is taken from the first of siteListText, siteFile and
    regEx that is not None.  If all three are None the user is prompted
    interactively for the processed sites directory and the site list.

    Parameters
    ----------
    siteListText : string (optional)
        a list of sites separated by semi-colons

    siteFile : string (optional)
        a filename of a text file with a list of sites

    regEx : string (optional)
        wildcard expression used to search for site directories within the
        processed sites directory (default = 'USGS-*')

    processedSitesDir : string (optional)
        directory that contains all of the processed site directories. It is
        important to change this if the default is not correct.
        (default='./Processed-Sites')

    loadPhreeqc : boolean
        If set to true, PHREEQC outputs will also be loaded for each site.
        (default=False)

    loadMetaData : boolean
        If set to true, the site metadata will be loaded for each site.
        (default=False)

    Returns
    -------
    sitesDict : dict
        A dictionary of site data panels keyed by site name.  Sites whose
        panel could not be read are left out.

    or if loadPhreeqc or loadMetaData are set to true

    (sitesDict, sitesPhreeqcDict, sitesMetaDataDict) : tuple
        A tuple containing the sitesDict and dicts of the PHREEQC data and/or
        metadata for each site. Order is as shown; a dict is only included
        when the corresponding flag is set.
    """
    if siteListText is not None:
        #check whether we have a valid site directory
        processedSitesDir = checkSitesDir(processedSitesDir)
        # siteListFromLine only needs the text; it does not take a directory.
        siteList = siteListFromLine(siteListText)
    elif siteFile is not None:
        #check whether we have a valid site directory
        processedSitesDir = checkSitesDir(processedSitesDir)
        # The directory keyword of siteListFromFile is named sitesDir.
        siteList = siteListFromFile(siteFile, sitesDir = processedSitesDir)
    elif regEx is not None:
        #check whether we have a valid site directory
        processedSitesDir = checkSitesDir(processedSitesDir)
        siteList = siteListFromRegEx(regEx, processedSitesDir = processedSitesDir)
    else:
        # if not provided then query user for needed data
        siteList, processedSitesDir = _promptForSiteList(processedSitesDir)

    #process the sites in the list
    sitesDict = {}
    sitesPhreeqcDict = {}
    sitesMetaDataDict = {}
    for site in siteList:
        sitePanel = loadSiteData(site, processedSitesDir = processedSitesDir)
        if sitePanel is None:
            #If site data does not read in correctly, loadSiteData returns None
            continue
        sitesDict[site] = sitePanel
        if loadPhreeqc:
            sitesPhreeqcDict[site] = loadSitePhreeqcData(site, processedSitesDir = processedSitesDir)
        if loadMetaData:
            sitesMetaDataDict[site] = loadSiteMetaData(site, processedSitesDir = processedSitesDir)

    if not (loadPhreeqc or loadMetaData):
        return sitesDict
    return_list = [sitesDict]
    if loadPhreeqc:
        return_list.append(sitesPhreeqcDict)
    if loadMetaData:
        return_list.append(sitesMetaDataDict)
    return tuple(return_list)


def _promptForSiteList(processedSitesDir):
    # Interactively ask for the processed sites directory and a source of
    # sites; returns the tuple (siteList, processedSitesDir).
    processedSitesInput = raw_input("Path of the processed sites directory (Default = ./Processed-Sites): ")
    if processedSitesInput != '':
        processedSitesDir = processedSitesInput
    # Echo the directory that is about to be validated.
    print(processedSitesDir)
    processedSitesDir = checkSitesDir(processedSitesDir)

    mode = None
    while mode is None:
        answer = raw_input("Do you want to: \n \t 1) enter a semi-colon separated list of sites \n \t 2) provide a text file of sites \n \t \n\t 3) provide an XML list of sites \n \t 4) provide a wildcard expression to obtain sites from directory list\n Enter 1, 2, 3, or 4: ")
        if answer.isdigit() and 0 < int(answer) < 5:
            mode = int(answer)
        else:
            print("Invalid input")

    if mode == 1:
        siteListText = raw_input("Enter list of sites separated by semi-colons: ")
        siteList = siteListFromLine(siteListText)
    elif mode == 2:
        siteFile = raw_input("Enter path to text file containing site list: ")
        siteList = siteListFromFile(siteFile, sitesDir = processedSitesDir)
    elif mode == 3:
        siteFile = raw_input("Enter path to XML file containing site list: ")
        siteList = siteListFromFile(siteFile, sitesDir = processedSitesDir, XML=True)
    else:
        regEx = raw_input("Enter regular expression: ")
        # Search the directory the user just confirmed, not the module default.
        siteList = siteListFromRegEx(regEx, processedSitesDir = processedSitesDir)
    return siteList, processedSitesDir
def loadSiteMetaData(site, processedSitesDir = DEFAULT_DIR):
    """
    Retrieves site metadata for an individual site from a directory of
    processed sites.

    Parameters
    ----------
    site : string
        name of site to retrieve, with or without USGS- tag at beginning.

    processedSitesDir : string (optional)
        directory that contains the processed site directory associated with
        the desired site. It is important to change this if the default is
        not correct. (default='./Processed-Sites')

    Returns
    -------
    siteMetaData : object or None
        The object unpickled from <site>-Site-Description.pkl, or None if the
        file could not be read.
    """
    #Add USGS tag if needed
    if not site.startswith('USGS-'):
        site = 'USGS-' + site
    metaDataFile = os.path.join(processedSitesDir, site, site+'-Site-Description.pkl')
    try:
        # NOTE: unpickling can execute arbitrary code; only point this at
        # processed-site directories that come from a trusted source.
        with open(metaDataFile, 'rb') as pickleFile:
            siteMetaData = pickle.load(pickleFile)
    except IOError:
        print ("Problem reading pickle file: " + metaDataFile )
        return None
    return siteMetaData
def loadSiteData(site, processedSitesDir = DEFAULT_DIR):
    """
    Load the pickled data panel of a single site from a directory of
    processed sites.

    Parameters
    ----------
    site : string
        name of the site to load; the USGS- tag at the beginning is optional
        and is added when missing.

    processedSitesDir : string (optional)
        directory holding the processed site directory of the requested
        site. Change this if the default is not correct.
        (default='./Processed-Sites')

    Returns
    -------
    sitePanel : pandas.core.panel.Panel
        A pandas panel object with data from the requested site, or None
        when the pickle file could not be read.
    """
    # The site directory and its files are looked up under the tagged name.
    siteName = site if site.startswith('USGS-') else 'USGS-'+site
    panelFile = os.path.join(processedSitesDir, siteName, siteName+'-Panel.pkl')
    try:
        return read_pickle(panelFile)
    except IOError:
        print ("Problem reading pickle file: " + panelFile )
        return None
def loadSitePhreeqcData(site, processedSitesDir = DEFAULT_DIR):
    """
    Load the pickled PHREEQC results of a single site from a directory of
    processed sites.

    Parameters
    ----------
    site : string
        name of the site to load; the USGS- tag at the beginning is optional
        and is added when missing.

    processedSitesDir : string (optional)
        directory holding the processed site directory of the requested
        site. Change this if the default is not correct.
        (default='./Processed-Sites')

    Returns
    -------
    sitedf : pandas.core.frame.DataFrame
        A pandas dataframe object with PHREEQC data from the requested site,
        or None when the pickle file could not be read.
    """
    usgsTag = 'USGS-'
    if not site.startswith(usgsTag):
        site = usgsTag + site
    phreeqcFile = os.path.join(processedSitesDir, site, site+'-PHREEQC.pkl')
    try:
        phreeqcData = read_pickle(phreeqcFile)
    except IOError:
        print ("Problem reading pickle file: " + phreeqcFile )
        phreeqcData = None
    return phreeqcData
def siteListFromLine(siteListText, processedSitesDir = DEFAULT_DIR):
    """
    Builds a list of site names from a semi-colon separated string.

    Parameters
    ----------
    siteListText : string
        site names separated by semi-colons, with or without the USGS- tag.

    processedSitesDir : string (optional)
        accepted so that all siteListFrom* functions can be called with the
        same keyword; it is not used here.

    Returns
    -------
    siteList : list
        the site names, stripped of surrounding whitespace and with the
        USGS- tag added where it was missing. Empty entries (for example
        from a trailing semi-colon) are skipped.
    """
    siteList = []
    for entry in siteListText.split(';'):
        site = entry.strip()
        if not site:
            # An empty entry would otherwise turn into the bogus site 'USGS-'.
            continue
        #check for USGS Tag at beginning of site number
        if not site.startswith('USGS-'):
            site = 'USGS-' + site
        siteList.append(site)
    return siteList


def siteListFromFile(siteFile, sitesDir = DEFAULT_DIR, XML=False, processedSitesDir = None):
    """
    Builds a list of site names from a text or XML file.

    Parameters
    ----------
    siteFile : string
        path of the file containing the sites. Files ending in '.xml' are
        read with extractSitesFromXML, all others with extractSitesFromText.

    sitesDir : string (optional)
        not used when building the list.

    XML : boolean (optional)
        If set to true, the file is read as XML regardless of its extension.
        (default=False)

    processedSitesDir : string (optional)
        accepted so that all siteListFrom* functions can be called with the
        same keyword; it is not used here.

    Returns
    -------
    siteList : list
        the site names with the USGS- tag added where it was missing.
    """
    if siteFile.endswith('.xml') or XML:
        sites = extractSitesFromXML(siteFile)
    else:
        sites = extractSitesFromText(siteFile)
    #check for USGS Tag at beginning of site number
    return [site if site.startswith('USGS-') else 'USGS-' + site
            for site in sites]


def siteListFromRegEx(regEx, processedSitesDir = DEFAULT_DIR):
    """
    Builds a list of site names from the entries of the processed sites
    directory that match a wildcard expression.

    Parameters
    ----------
    regEx : string
        wildcard expression, joined to processedSitesDir and passed to glob.

    processedSitesDir : string (optional)
        directory that is searched. (default='./Processed-Sites')

    Returns
    -------
    siteList : list
        the final path component of every match, in the order glob
        returned them.
    """
    listText = os.path.join(processedSitesDir, regEx)
    return [os.path.basename(sitePath) for sitePath in glob(listText)]


def checkSitesDir(processedSitesDir):
    """
    Makes sure the processed sites directory exists, prompting the user for
    a new path until an existing one is given.

    Parameters
    ----------
    processedSitesDir : string
        path to check.

    Returns
    -------
    processedSitesDir : string
        the given path if it exists, otherwise the first existing path
        entered by the user. An empty reply selects the default directory.
    """
    while not os.path.exists(processedSitesDir):
        print("Invalid path to processed sites directory.")
        processedSitesDir = raw_input("Path of the processed sites directory (Default = ./Processed-Sites): ")
        if processedSitesDir == '':
            # Honour the default that the prompt advertises.
            processedSitesDir = DEFAULT_DIR
    return processedSitesDir