# Source code for olm.USGS.loadWaterQualityData

"""
Functions to load water quality data that has been processed and pickled by WQXtoPHREEQC
"""
import os
import cPickle as pickle
from pandas.io.pickle import read_pickle
from siteListExtraction import *
from glob import glob

# Default location of the processed sites directory; used as the default
# value of the directory argument by the loader functions in this module.
DEFAULT_DIR = './Processed-Sites'

# siteListText - list of sites separated by semi-colons
# siteFile - text file with list of sites
# regEx - a glob-style wildcard pattern matched against site directory names inside the processed sites directory
def loadSiteListData(siteListText = None,
                     siteFile = None, 
                     regEx = 'USGS-*', 
                     processedSitesDir = DEFAULT_DIR,
                     loadPhreeqc = False,
                     loadMetaData = False
                     ):
    """
    Retrieves site data for multiple sites within a processed sites directory.

    The list of sites is taken from the first of siteListText, siteFile and
    regEx that is not None (in that order). Because regEx has a non-None
    default, the interactive prompts are only reached when regEx=None is
    passed explicitly and neither siteListText nor siteFile is given.

    Parameters
    ----------
    siteListText : string (optional)
        a list of sites separated by semi-colons

    siteFile : string (optional)
        a filename of a text file (or XML file) with a list of sites

    regEx : string (optional)
        glob-style wildcard pattern used to search for site directories within the processed sites directory (default = 'USGS-*')

    processedSitesDir : string (optional)
        directory that contains all of the processed site directories. It is important to change this if the default is not correct. If the path does not exist the user is prompted for another one. (default='./Processed-Sites')
        
    loadPhreeqc : boolean
        If set to true, PHREEQC outputs will also be loaded for each site. (default=False)
        
    loadMetaData : boolean
        If set to true, the site metadata will be loaded for each site. (default=False)

    Returns
    -------
    sitesDict : dict
        A dictionary of site data panels keyed by site name. Sites whose panel file could not be read are omitted.

    or if loadPhreeqc or loadMetaData are set to true
    (sitesDict, sitesPhreeqcDict, sitesMetaDataDict) : tuple
       A tuple containing the sitesDict and dicts of the PHREEQC data and/or metadata for each site. Only the requested dicts are included; order is as shown.

    """
    if siteListText is not None:
        #check whether we have a valid site directory
        processedSitesDir = checkSitesDir(processedSitesDir)
        siteList = siteListFromLine(siteListText)
    elif siteFile is not None:
        #check whether we have a valid site directory
        processedSitesDir = checkSitesDir(processedSitesDir)
        # directory is passed positionally so the call does not depend on the keyword name
        siteList = siteListFromFile(siteFile, processedSitesDir)
    elif regEx is not None:
        #check whether we have a valid site directory
        processedSitesDir = checkSitesDir(processedSitesDir)
        siteList = siteListFromRegEx(regEx, processedSitesDir)
    else:
        # if not provided then query user for needed data
        processedSitesInput = raw_input("Path of the processed sites directory (Default = ./Processed-Sites): ")
        if (processedSitesInput != ''):
            processedSitesDir = processedSitesInput
        print(processedSitesDir)
        processedSitesDir = checkSitesDir(processedSitesDir)
        # keep asking until the answer is one of the four menu entries
        mode = 0
        while mode not in (1, 2, 3, 4):
            answer = raw_input("Do you want to: \n \t 1) enter a semi-colon separated list of sites \n \t 2) provide a text file of sites \n \t \n\t 3) provide an XML list of sites \n \t 4) provide a wildcard expression to obtain sites from directory list\n Enter 1, 2, 3, or 4: ")
            if answer.isdigit() and 0 < int(answer) < 5:
                mode = int(answer)
            else:
                print("Invalid input")
        if mode == 1:
            siteListText = raw_input("Enter list of sites separated by semi-colons: ")
            siteList = siteListFromLine(siteListText)
        elif mode == 2:
            siteFile = raw_input("Enter path to text file containing site list: ")
            siteList = siteListFromFile(siteFile, processedSitesDir)
        elif mode == 3:
            siteFile = raw_input("Enter path to XML file containing site list: ")
            siteList = siteListFromFile(siteFile, processedSitesDir, XML=True)
        else:
            regEx = raw_input("Enter regular expression: ")
            # search the directory the user just confirmed, not the module default
            siteList = siteListFromRegEx(regEx, processedSitesDir)

    #process the sites in the list
    sitesDict = {}
    sitesPhreeqcDict = {}
    sitesMetaDataDict = {}
    for site in siteList:
        sitePanel = loadSiteData(site, processedSitesDir = processedSitesDir)
        if sitePanel is None:
            #If site data does not read in correctly, loadSiteData returns None
            continue
        sitesDict[site] = sitePanel
        if loadPhreeqc:
            sitesPhreeqcDict[site] = loadSitePhreeqcData(site, processedSitesDir = processedSitesDir)
        if loadMetaData:
            sitesMetaDataDict[site] = loadSiteMetaData(site, processedSitesDir = processedSitesDir)

    if not (loadPhreeqc or loadMetaData):
        return sitesDict
    return_list = [sitesDict]
    if loadPhreeqc:
        return_list.append(sitesPhreeqcDict)
    if loadMetaData:
        return_list.append(sitesMetaDataDict)
    return tuple(return_list)

def loadSiteMetaData(site, processedSitesDir = DEFAULT_DIR):
    """
    Retrieves the pickled site description (metadata) for an individual site.

    Parameters
    ----------
    site : string
        name of site to retrieve, with or without USGS- tag at beginning.

    processedSitesDir : string (optional)
        directory that contains the processed site directory associated with the desired site. (default='./Processed-Sites')

    Returns
    -------
    siteMetaData : object or None
        The object stored in <site>-Site-Description.pkl inside the site's directory, or None if that file could not be opened or read.

    """
    #Add USGS tag if needed
    if not(site.startswith('USGS-')):
        site = 'USGS-'+site
    # Build the path before the try block so the error message can always name it
    metaDataFile = os.path.join(processedSitesDir, site, site+'-Site-Description.pkl')
    try:
        # NOTE: unpickling can execute arbitrary code; only load files from a trusted source
        with open(metaDataFile, 'rb') as pickleFile:
            siteMetaData = pickle.load(pickleFile)
    except IOError:
        print ("Problem reading pickle file: " + metaDataFile )
        return None
    return siteMetaData
    


def loadSiteData(site, processedSitesDir = DEFAULT_DIR):
    """
    Retrieves site data for an individual site from a directory of processed sites.

    Parameters
    ----------
    site : string
        name of site to retrieve, with or without USGS- tag at beginning.

    processedSitesDir : string (optional)
        directory that contains the processed site directory associated with the desired site. It is important to change this if the default is not correct. (default='./Processed-Sites')

    Returns
    -------
    sitePanel : pandas.core.panel.Panel or None
        A pandas panel object with data from the requested site, read from <site>-Panel.pkl inside the site's directory. None is returned (after printing a message) if the file could not be read.

    """
    #Add USGS tag if needed
    if not(site.startswith('USGS-')):
        site = 'USGS-'+site
    # Build the path before the try block so the error message can always name it
    panelFile = os.path.join(processedSitesDir, site, site+'-Panel.pkl')
    try:
        sitePanel = read_pickle(panelFile)
    except IOError:
        print ("Problem reading pickle file: " + panelFile )
        return None
    return sitePanel

def loadSitePhreeqcData(site, processedSitesDir = DEFAULT_DIR):
    """
    Retrieves site PHREEQC data for an individual site from a directory of processed sites.

    Parameters
    ----------
    site : string
        name of site to retrieve, with or without USGS- tag at beginning.

    processedSitesDir : string (optional)
        directory that contains the processed site directory associated with the desired site. It is important to change this if the default is not correct. (default='./Processed-Sites')

    Returns
    -------
    sitedf : pandas.core.frame.DataFrame or None
        A pandas dataframe object with PHREEQC data from the requested site, read from <site>-PHREEQC.pkl inside the site's directory. None is returned (after printing a message) if the file could not be read.

    """
    #Add USGS tag if needed
    if not(site.startswith('USGS-')):
        site = 'USGS-'+site
    # Build the path before the try block so the error message can always name it
    phreeqcFile = os.path.join(processedSitesDir, site, site+'-PHREEQC.pkl')
    try:
        sitedf = read_pickle(phreeqcFile)
    except IOError:
        print ("Problem reading pickle file: " + phreeqcFile )
        return None
    return sitedf
            
def siteListFromLine(siteListText, processedSitesDir = None):
    """
    Builds a list of site names from a semi-colon separated string.

    Parameters
    ----------
    siteListText : string
        site names separated by semi-colons; surrounding whitespace is ignored.

    processedSitesDir : string (optional)
        accepted so callers can pass the processed sites directory by keyword; it is not used when building the list.

    Returns
    -------
    siteList : list of string
        site names, each starting with the 'USGS-' tag. Empty entries (for example from a trailing semi-colon or an empty string) are skipped.

    """
    siteList = []
    for entry in siteListText.split(';'):
        site = entry.strip()
        if not site:
            # an empty entry would otherwise become the bogus site name 'USGS-'
            continue
        #check for USGS Tag at beginning of site number
        if not site.startswith('USGS-'):
            site = 'USGS-' + site
        siteList.append(site)
    return siteList

def siteListFromFile(siteFile, 
                     sitesDir = DEFAULT_DIR,
                     XML=False,
                     processedSitesDir = None):
    """
    Builds a list of site names from a text or XML file of sites.

    Parameters
    ----------
    siteFile : string
        path of the file containing the site list. Files ending in '.xml' are read with extractSitesFromXML, all others with extractSitesFromText.

    sitesDir : string (optional)
        processed sites directory; currently not used when building the list.

    XML : boolean (optional)
        If true, the file is read as XML regardless of its extension. (default=False)

    processedSitesDir : string (optional)
        alias of sitesDir, accepted so callers can use the same keyword as the other functions in this module; currently not used when building the list.

    Returns
    -------
    siteList : list of string
        site names, each starting with the 'USGS-' tag.

    """
    if siteFile.endswith('.xml') or XML:
        rawSites = extractSitesFromXML(siteFile)
    else:
        rawSites = extractSitesFromText(siteFile)
    #check for USGS Tag at beginning of site number
    siteList = []
    for site in rawSites:
        if not site.startswith('USGS-'):
            site = 'USGS-' + site
        siteList.append(site)
    return siteList

def siteListFromRegEx(regEx,
                      processedSitesDir = DEFAULT_DIR):
    """
    Lists the entries of the processed sites directory that match a wildcard.

    Parameters
    ----------
    regEx : string
        glob-style wildcard pattern matched against names inside processedSitesDir.

    processedSitesDir : string (optional)
        directory that is searched. (default='./Processed-Sites')

    Returns
    -------
    siteList : list of string
        final path component of every match returned by glob.

    """
    pattern = os.path.join(processedSitesDir, regEx)
    # keep only the last path component of each match (the site directory name)
    return [os.path.basename(matchedPath) for matchedPath in glob(pattern)]

def checkSitesDir(processedSitesDir):
    """
    Returns a processed sites directory path that exists.

    If the given path does not exist, a message is printed and the user is
    prompted for another path; this repeats until an existing path is entered.

    Parameters
    ----------
    processedSitesDir : string
        path to check first.

    Returns
    -------
    processedSitesDir : string
        the first path (given or entered) that exists.

    """
    while True:
        if os.path.exists(processedSitesDir):
            return processedSitesDir
        print("Invalid path to processed sites directory.")
        processedSitesDir = raw_input("Path of the processed sites directory (Default = ./Processed-Sites): ")