
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Full license can be found in License.md
# Full author list can be found in .zenodo.json file
# DOI:10.5281/zenodo.3824979
#
# DISTRIBUTION STATEMENT A: Approved for public release. Distribution is
# unlimited.
# ----------------------------------------------------------------------------
"""General routines for integrating CEDAR Madrigal instruments into pysat."""

import datetime as dt
import gzip
import numpy as np
import os
import pandas as pds
import xarray as xr

import h5py
import pysat

from madrigalWeb import madrigalWeb


file_types = {'hdf5': 'hdf5', 'netCDF4': 'netCDF4', 'simple': 'simple.gz'}


def cedar_rules():
    """General acknowledgement statement for Madrigal data.

    Returns
    -------
    ackn : str
        String with general acknowledgement for all CEDAR Madrigal data

    """
    ackn = "".join(["Contact the PI when using this data, in accordance ",
                    "with the CEDAR 'Rules of the Road'"])
    return ackn


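# Illustrative sketch (not part of the original module): instrument support
# modules typically attach this acknowledgement during Instrument setup,
# along the lines of the hypothetical `init` method below.
#
#     def init(self):
#         """Initialize the Instrument object (hypothetical sketch)."""
#         self.acknowledgements = cedar_rules()
#         pysat.logger.info(self.acknowledgements)

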
def known_madrigal_inst_codes(pandas_format=None):
    """Supply known Madrigal instrument codes with a brief description.

    Parameters
    ----------
    pandas_format : bool or NoneType
        Separate instrument codes by time-series (True) or multi-dimensional
        data types (False) if a boolean is supplied, or supply all if
        NoneType (default=None)

    Returns
    -------
    inst_codes : dict
        Dictionary with string instrument code values as keys and a brief
        description of the corresponding instrument as the value.

    """
    time_series = {'120': 'Interplanetary Mag Field and Solar Wind',
                   '210': 'Geophysical Indices',
                   '211': 'AE Index',
                   '212': 'DST Index',
                   '170': 'POES Spacecraft Particle Flux',
                   '180': 'DMSP-Auroral Boundary Index',
                   '8100': 'Defense Meteorological Satellite Program',
                   '8105': 'Van Allen Probes',
                   '8400': 'Jason/Topex Ocean TEC',
                   '8250': 'Jicamarca Magnetometer',
                   '8255': 'Piura Magnetometer',
                   '8300': 'Sodankyla Magnetometer',
                   '7800': 'Green Bank Telescope'}
    multi_dim = {'10': 'Jicamarca ISR',
                 '20': 'Arecibo ISR Linefeed',
                 '21': 'Arecibo ISR Gregorian',
                 '22': 'Arecibo ISR Velocity Vector',
                 '25': 'MU ISR',
                 '30': 'Millstone Hill ISR',
                 '31': 'Millstone Hill UHF Steerable Antenna',
                 '32': 'Millstone Hill UHF Zenith Antenna',
                 '40': 'St. Santin ISR',
                 '41': 'St. Santin Nançay Receiver',
                 '42': 'St. Santin Mende Receiver',
                 '43': 'St. Santin Monpazier Receiver',
                 '45': 'Kharkov Ukraine ISR',
                 '50': 'Chatanika ISR',
                 '53': 'ISTP Irkutsk Radar',
                 '57': 'UK Malvern ISR',
                 '61': 'Poker Flat ISR',
                 '70': 'EISCAT combined ISRs',
                 '71': 'EISCAT Kiruna UHF ISR',
                 '72': 'EISCAT Tromsø UHF ISR',
                 '73': 'EISCAT Sodankylä UHF ISR',
                 '74': 'EISCAT Tromsø VHF ISR',
                 '75': 'EISCAT Kiruna VHF ISR',
                 '76': 'EISCAT Sodankylä VHF ISR',
                 '80': 'Sondrestrom ISR',
                 '85': 'ALTAIR ISR',
                 '91': 'Resolute Bay North ISR',
                 '92': 'Resolute Bay Canada ISR',
                 '95': 'EISCAT Svalbard ISR Longyearbyen',
                 '100': 'QuJing ISR',
                 '310': 'TGCM/TIGCM model',
                 '311': 'AMIE Model',
                 '312': 'USU-TDIM Model',
                 '320': 'Solar sd Tides',
                 '321': 'Lunar sd Tides',
                 '322': 'GSWM model',
                 '820': 'Halley HF Radar',
                 '830': 'Syowa Station HF Radar',
                 '845': 'Kapuskasing HF Radar',
                 '861': 'Saskatoon HF Radar',
                 '870': 'Goose Bay HF Radar',
                 '900': 'Hankasalmi HF Radar',
                 '910': 'Stokkseyri HF Radar',
                 '911': 'Pykkvibaer HF Radar',
                 '1040': 'Arecibo MST Radar',
                 '1140': 'Poker Flat MST Radar',
                 '1180': 'SOUSY Svalbard MST Radar Longyearbyen',
                 '1210': 'Scott Base MF Radar',
                 '1215': 'Davis Antarctica MF radar',
                 '1220': 'Mawson MF Radar',
                 '1221': 'Rothera MF radar',
                 '1230': 'Christchurch MF Radar',
                 '1240': 'Adelaide MF Radar',
                 '1245': 'Rarotonga MF radar',
                 '1254': 'Tirunelveli MF radar',
                 '1270': 'Kauai MF radar',
                 '1275': 'Yamagawa MF radar',
                 '1285': 'Platteville MF radar',
                 '1310': 'Wakkanai MF radar',
                 '1320': 'Collm LF Radar',
                 '1340': 'Saskatoon MF Radar',
                 '1375': 'The Poker Flat MF radar',
                 '1390': 'Tromsø MF Radar',
                 '1395': 'Syowa MF Radar',
                 '1400': 'Halley MF Radar',
                 '13': 'JASMET Jicamarca All-Sky Specular Meteor Radar',
                 '1539': 'Ascension Island Meteor Radar',
                 '1540': 'Rothera Meteor Radar',
                 '1560': 'Atlanta meteor Radar',
                 '1620': 'Durham meteor Radar',
                 '1750': 'Obninsk meteor radar',
                 '1775': 'Esrange meteor radar',
                 '1780': 'Wuhan meteor radar',
                 '1781': 'Mohe meteor radar',
                 '1782': 'Beijing meteor radar',
                 '1783': 'Sanya meteor radar',
                 '1784': 'South Pole meteor radar',
                 '1785': 'Southern Argentina Agile Meteor Radar',
                 '1786': 'Cachoeira Paulista Meteor Radar',
                 '1787': 'Buckland Park Meteor Radar',
                 '1788': 'Kingston Meteor Radar',
                 '1790': 'Andes Meteor Radar',
                 '1791': 'Southern Cross Meteor Radar',
                 '1792': 'Las Campanas Meteor Radar',
                 '1793': 'CONDOR multi-static meteor radar system',
                 '2090': 'Christmas Island ST/MEDAC Radar',
                 '2200': 'Platteville ST/MEDAC Radar',
                 '2550': 'ULowell Digisonde MLH Radar',
                 '2890': 'Sondre Stromfjord Digisonde',
                 '2900': 'Sodankylä Ionosonde (SO166)',
                 '2930': 'Qaanaaq Digisonde ST/MEDAC Radars',
                 '2950': 'EISCAT Tromsø Dynasonde',
                 '2951': 'EISCAT Svalbard Dynasonde',
                 '2952': 'IRF Dynasonde at EISCAT site Kiruna',
                 '5000': 'South Pole Fabry-Perot',
                 '5005': 'Palmer Fabry Perot',
                 '5015': 'Arrival Heights Fabry-Perot',
                 '5020': 'Halley Fabry-Perot',
                 '5060': 'Mount John Fabry-Perot',
                 '5140': 'Fabry-Perot Arequipa',
                 '5145': 'Fabry-Perot Jicamarca',
                 '5150': 'Fabry-Perot Mobile',
                 '5160': 'Arecibo Fabry-Perot',
                 '5190': 'Kitt Peak H-alpha Fabry-Perot',
                 '5240': 'Fritz Peak Fabry-Perot',
                 '5292': 'Ann Arbor Fabry-Perot',
                 '5300': 'Peach Mountain Fabry-Perot',
                 '5340': 'Millstone Hill Fabry-Perot',
                 '5360': 'Millstone Hill High-Res Fabry-Perot',
                 '5370': 'Arecibo Imaging Doppler Fabry-Perot',
                 '5380': 'Culebra Fabry-Perot',
                 '5430': 'Watson Lake Fabry-Perot',
                 '5460': 'College Fabry-Perot',
                 '5465': 'Poker Flat all-sky scanning Fabry-Perot',
                 '5470': 'Fort Yukon Fabry-Perot',
                 '5475': 'Poker Flat Fabry-Perot',
                 '5480': 'Sondre Stromfjord Fabry-Perots',
                 '5510': 'Inuvik NWT Fabry-Perot',
                 '5535': 'Resolute Bay Fabry-Perot',
                 '5540': 'Thule Fabry-Perot',
                 '5545': 'Cariri Brazil FPI',
                 '5546': 'Cajazeiras Brazil FPI',
                 '5547': 'Pisgah Astronomical Research FPI',
                 '5548': 'Urbana Atmospheric Observatory FPI',
                 '5549': 'Kirtland Airforce Base FPI',
                 '5550': 'Virginia Tech FPI',
                 '5551': 'Peach Mountain (MiniME) FPI',
                 '5552': 'Merihill Peru FPI',
                 '5553': 'Nazca Peru FPI',
                 '5554': 'Eastern Kentucky FPI',
                 '5600': 'Jang Bogo Station FPI',
                 '5700': 'South Pole Michelson Interferometer',
                 '5720': 'Daytona Beach Michelson Interferometer',
                 '5860': 'Stockholm IR Michelson',
                 '5900': 'Sondrestrom Michelson Interferometer',
                 '5950': 'Resolute Bay Michelson Interferometer',
                 '5980': 'Eureka Michelson Interferometer',
                 '6205': 'Arecibo Potassium [K] lidar',
                 '6206': 'Arecibo Sodium [Na] lidar',
                 '6300': 'CEDAR lidar',
                 '6320': 'Colorado State sodium lidar',
                 '6330': 'Rayleigh lidar at the ALO - USU/CASS',
                 '6340': 'Andes Na T/W Lidar',
                 '6350': 'ALOMAR Sodium Lidar',
                 '6360': 'CU STAR Sodium Lidar',
                 '6370': 'USU Na lidar',
                 '6380': 'Poker Flat lidar',
                 '7190': 'USU CCD Imager',
                 '7192': 'USU Advanced Mesospheric Temperature Mapper',
                 '7200': 'BU Millstone All-Sky Imager',
                 '7201': 'BU Arecibo All-Sky Imager',
                 '7202': 'BU Asiago All-Sky Imager',
                 '7203': 'BU El Leoncito All-Sky Imager',
                 '7204': 'BU McDonald All-Sky Imager',
                 '7205': 'BU Rio Grande All-Sky Imager',
                 '7206': 'BU Jicamarca All-Sky Imager',
                 '7240': 'MIO',
                 '7580': 'All-sky cameras at Qaanaaq',
                 '11': 'Jicamarca Bistatic Radar',
                 '840': 'JULIA',
                 '3000': 'ARL UT TBB Receiver',
                 '7600': 'Chelmsford HS Ozone Radiometer',
                 '7602': 'Lancaster UK Ozone Radiometer',
                 '7603': 'Bridgewater MA Ozone Radiometer',
                 '7604': 'Union College Ozone Radiometer',
                 '7605': 'UNC Greensboro Ozone Radiometer',
                 '7606': 'Lynnfield HS Ozone Radiometer',
                 '7607': 'Alaska Pacific Ozone Radiometer',
                 '7608': 'Hermanus SA Ozone Radiometer',
                 '7609': 'Sanae Antarctic Ozone Radiometer',
                 '7610': 'Sodankylä Ozone Radiometer',
                 '7611': 'Lancaster2 UK Ozone Radiometer',
                 '7612': 'Haystack Ridge Ozone Radiometer',
                 '7613': 'Haystack NUC3 8-channel Ozone Radiometer',
                 '7614': 'Fairbanks Ozone Radiometer',
                 '8001': 'South Pole Scintillation Receiver',
                 '8000': 'World-wide GNSS Receiver Network',
                 '8002': 'McMurdo Scintillation Receiver',
                 '8010': 'GNSS Scintillation Network',
                 '3010': 'Davis Czerny-Turner Scanning Spectrophotometer',
                 '3320': 'Wuppertal (DE) Czerny-Turner OH Grating '
                         'Spectrometer',
                 '4470': 'Poker Flat 4 Channel Filter Photometer',
                 '4473': 'Fort Yukon 4 Channel Filter Photometer',
                 '4480': 'Arecibo red line photometer',
                 '4481': 'Arecibo green line photometer',
                 '7191': 'USU Mesospheric Temperature Mapper'}

    if pandas_format is None:
        inst_codes = dict(**time_series, **multi_dim)
    elif pandas_format:
        inst_codes = time_series
    else:
        inst_codes = multi_dim

    return inst_codes


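# Usage sketch (follows only the behavior documented above): fetch the code
# dictionaries and look up an instrument by its Madrigal code.
#
#     all_codes = known_madrigal_inst_codes()
#     ts_codes = known_madrigal_inst_codes(pandas_format=True)
#     all_codes['30']   # -> 'Millstone Hill ISR'
#     '30' in ts_codes  # -> False, since ISR data is multi-dimensional

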
def madrigal_file_format_str(inst_code, strict=False, verbose=True):
    """Supply the file format string for a known Madrigal instrument code.

    Parameters
    ----------
    inst_code : int
        Madrigal instrument code as an integer
    strict : bool
        If True, returns only file formats that will definitely not have a
        problem being parsed by pysat.  If False, will return any file
        format. (default=False)
    verbose : bool
        If True raises logging warnings, if False does not log any warnings.
        (default=True)

    Returns
    -------
    fstr : str
        File formatting string that may or may not be parsable by pysat

    Raises
    ------
    ValueError
        If file formats with problems would be returned and `strict` is True.

    Note
    ----
    File strings that have multiple '*' wildcards typically have several
    experiment types and require a full pysat Instrument to properly manage
    these types.

    """
    if not isinstance(inst_code, int):
        inst_code = int(inst_code)

    format_str = {
        120: 'imf{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        210: 'geo{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        211: 'aei{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        212: 'dst{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        170: 'pfx{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        180: 'dmp{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        8100: 'dms*_{{year:04d}}{{month:02d}}{{day:02d}}_*.{{version:03d}}.',
        8105: 'van_allen_{{year:04d}}_{{month:02d}}.{{version:03d}}.',
        8400: '???{{year:04d}}{{month:02d}}{{day:02d}}j*.{{version:03d}}.',
        8250: 'jic{{year:04d}}{{month:02d}}{{day:02d}}_mag.{{version:03d}}.',
        8255: 'pmt*.',
        8300: 'smt*.',
        7800: 'gbt{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        10: 'jro{{year:04d}}{{month:02d}}{{day:02d}}*.{{version:03d}}.',
        20: 'aro*{{year:02d}}{{month:02d}}{{day:02d}}a.{{version:03d}}.',
        21: 'aro*{{year:02d}}{{month:02d}}{{day:02d}}*g.{{version:03d}}.',
        22: 'ar?*{{year:02d}}{{month:02d}}{{day:02d}}*.{{version:03d}}.',
        25: 'mui{{year:02d}}{{month:02d}}{{day:02d}}?.{{version:03d}}.',
        30: 'mlh{{year:02d}}{{month:02d}}{{day:02d}}?.{{version:03d}}.',
        31: 'mlh{{year:02d}}{{month:02d}}{{day:02d}}?.{{version:03d}}.',
        32: 'mlh{{year:02d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        40: 'sts{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        41: 'sts{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        42: 'sts{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        43: 'sts{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        45: 'kha{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        50: 'cht{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        53: 'ist{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        57: 'mlv{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        61: 'pfa{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        70: 'MAD????_{{year:04d}}-{{month:02d}}-{{day:02d}}_*.',
        71: 'MAD????_{{year:04d}}-{{month:02d}}-{{day:02d}}_*@kir.',
        72: 'MAD????_{{year:04d}}-{{month:02d}}-{{day:02d}}_*@uhf.',
        73: 'MAD????_{{year:04d}}-{{month:02d}}-{{day:02d}}_*@sod.',
        74: 'MAD????_{{year:04d}}-{{month:02d}}-{{day:02d}}_*@vhf.',
        75: 'MAD????_{{year:04d}}-{{month:02d}}-{{day:02d}}_*@vkrv*.',
        76: 'MAD????_{{year:04d}}-{{month:02d}}-{{day:02d}}_*@sdv*.',
        80: 'son{{year:02d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        85: 'ALT{{year:02d}}{{month:02d}}{{day:02d}}_*.',
        91: 'ran{{year:02d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        92: 'ras{{year:02d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        95: 'MAD????_{{year:04d}}-{{month:02d}}-{{day:02d}}_*@esr.',
        100: 'MAD????_{{year:04d}}-{{month:02d}}-{{day:02d}}_*@quj.',
        310: 'gcm*.', 311: 'ami*.', 312: 'tdi*.', 320: 'sdt*.',
        321: 'sdl*.', 322: 'gsw*.', 820: 'hhf*.', 830: 'syf*.',
        845: 'khf*.', 861: 'shf*.', 870: 'gbf*.', 900: 'fhf*.',
        910: 'whf*.', 911: 'ehf*.',
        1040: 'arm{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1140: 'pkr{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1180: 'ssr*.',
        1210: 'sbf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1215: 'dav{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1220: 'maf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1221: 'rth{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1230: 'ccf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1240: 'adf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1245: 'rtg{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1254: 'tyr{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1270: 'kau{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1275: 'yam{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1285: 'plr{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1310: 'wak{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1320: 'cof{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1340: 'saf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1375: 'rpk{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1390: 'trf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1395: 'sym_{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        1400: 'hmf_{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        13: 'D{{year:04d}}{{month:02d}}*.',
        1539: 'asc{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1540: 'rmr_{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        1560: 'atm{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1620: 'dum{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        1750: 'obn{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1775: 'emr{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        1780: 'wmr*.', 1781: 'mmr*.', 1782: 'bmr*.', 1783: 'smr*.',
        1784: 'som{{year:02d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        1785: 'amr{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        1786: 'cpr_{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        1787: 'bpr_{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        1788: 'kgr_{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        1790: 'ame*.', 1791: 'sco*.', 1792: 'lcm*.',
        1793: 'alo{{year:04d}}{{month:02d}}{{day:02d}}_{{version:03d}}.',
        2090: 'cia{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        2200: 'pla{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        2550: 'uld*.',
        2890: 'ssd{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        2900: 'sdi*.',
        2930: 'qad{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        2950: 'trd*.', 2951: 'lrd*.', 2952: 'krd*.',
        5000: 'spf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5005: 'pfi{{year:04d}}{{month:02d}}{{day:02d}}.',
        5015: 'ahf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5020: 'hfp{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5060: 'mjf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5140: 'aqf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5145: 'jfp{{year:04d}}{{month:02d}}{{day:02d}}_*.{{version:03d}}.',
        5150: 'mfp{{year:04d}}{{month:02d}}{{day:02d}}_*.{{version:03d}}.',
        5160: 'afp{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5190: 'kha{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5240: 'fpf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5292: 'aaf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5300: 'pfp{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5340: 'mfp{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.?.',
        5360: 'kfp{{year:02d}}{{month:02d}}{{day:02d}}g*.',
        5370: 'aif{{year:02d}}{{month:02d}}{{day:02d}}g*.',
        5380: 'clf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5430: 'wfp{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5460: 'cfp{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5465: 'pkf{{year:02d}}{{month:02d}}{{day:02d}}*.',
        5470: 'FYU{{year:04d}}{{month:02d}}{{day:02d}}.',
        5475: 'PKZ{{year:04d}}{{month:02d}}{{day:02d}}.',
        5480: 'sfp{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5510: 'ikf{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5535: 'rfp{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5540: 'tfp{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5545: ''.join(['minime01_car_{{year:04d}}{{month:02d}}{{day:02d}}.',
                       'cedar.{{version:03d}}.']),
        5546: ''.join(['minime02_caj_{{year:04d}}{{month:02d}}{{day:02d}}.',
                       'cedar.{{version:03d}}.']),
        5547: ''.join(['minime06_par_{{year:04d}}{{month:02d}}{{day:02d}}.',
                       'cedar.{{version:03d}}.']),
        5548: ''.join(['minime02_uao_{{year:04d}}{{month:02d}}{{day:02d}}.',
                       'cedar.{{version:03d}}.']),
        5549: 'Kirtland Airforce Base FPI',
        5550: ''.join(['minime09_vti_{{year:04d}}{{month:02d}}{{day:02d}}.',
                       'cedar.{{version:03d}}.']),
        5551: ''.join(['minime08_ann_{{year:04d}}{{month:02d}}{{day:02d}}.',
                       'cedar.{{version:03d}}.']),
        5552: 'Merihill Peru FPI',
        5553: 'Nazca Peru FPI',
        5554: ''.join(['minime07_euk_{{year:04d}}{{month:02d}}{{day:02d}}.',
                       'cedar.{{version:03d}}.']),
        5600: 'jbs_{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        5700: 'spm{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5720: 'dbm{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5860: 'stm{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5900: 'sfm{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5950: 'rbm{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        5980: 'eum{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        6205: 'akl{{year:02d}}{{month:02d}}{{day:02d}}g.*.',
        6206: 'asl{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        6300: 'uil{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        6320: 'Colorado State sodium lidar',
        6330: 'usl{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        6340: 'alo*.', 6350: 'nlo*.', 6360: 'cul*.', 6370: 'unl*.',
        6380: 'pfl{{year:04d}}{{month:02d}}{{day:02d}}_{{cycle:03d}}.',
        7190: 'usi*.',
        7192: 'amp{{year:02d}}{{month:02d}}{{day:02d}}?.{{version:03d}}.',
        7200: 'mhi{{year:04d}}{{month:02d}}{{day:02d}}.{kindat}.',
        7201: 'aai{{year:04d}}{{month:02d}}{{day:02d}}.{kindat}.',
        7202: 'abi{{year:04d}}{{month:02d}}{{day:02d}}.{kindat}.',
        7203: 'eai{{year:04d}}{{month:02d}}{{day:02d}}.{kindat}.',
        7204: 'mai{{year:04d}}{{month:02d}}{{day:02d}}.{kindat}.',
        7205: 'rai{{year:04d}}{{month:02d}}{{day:02d}}.{kindat}.',
        7206: 'jci{{year:04d}}{{month:02d}}{{day:02d}}.{kindat}.',
        7240: 'mhi*.', 7580: 'qac*.',
        11: 'j??*{{year:02d}}{{month:02d}}{{day:02d}}g.{{version:03d}}.',
        840: 'jul{{year:04d}}{{month:02d}}{{day:02d}}_esf.{{version:03d}}.',
        3000: 'utx*.',
        8001: '????_?_??.gps_all.out.',
        8000: '*{{year:02d}}{{month:02d}}{{day:02d}}*.{{version:03d}}.',
        8002: '????_?_??.gps_all.out.',
        8010: 'scin_{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        3010: 'dvs{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        3320: 'wup{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        4470: 'p4p{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        4473: 'y4p{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.',
        4480: 'arp{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        4481: 'agp{{year:04d}}{{month:02d}}{{day:02d}}.{{version:03d}}.',
        7191: 'mtm{{year:02d}}{{month:02d}}{{day:02d}}g.{kindat}.'}

    # Warn if the file format is not available
    msg = ""
    if inst_code not in format_str.keys():
        msg = "".join(["file format string not available for ",
                       "instrument code {:d}: ".format(inst_code)])
        fstr = "*."
    else:
        fstr = format_str[inst_code]

    # Warn if the file format has multiple '*' wildcards
    num_wc = len(fstr.split("*"))
    if num_wc >= 3:
        msg = "".join(["file format string has multiple '*' ",
                       "wildcards, may not be parsable by pysat"])
    elif fstr.find('{{year') < 0 and fstr != "*.":
        msg = "".join(["file format string missing date info, ",
                       "may not be parsable by pysat"])
    elif num_wc > 1:
        nspec_sec = 0
        for fsplit in fstr.split("*"):
            if fsplit.find("}}") > 0 and fsplit.find("{{") >= 0:
                nspec_sec += 1

        if nspec_sec > 1:
            msg = "".join(["file format string has '*' between formatting",
                           " constraints, may not be parsable by pysat"])

    if len(msg) > 0:
        if strict:
            raise ValueError(msg)
        elif verbose:
            pysat.logger.warning(msg)

    fstr += "{file_type}"
    return fstr


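# Usage sketch: construct a pysat-compatible file format string for the
# Millstone Hill ISR (code 30).  The trailing '{file_type}' placeholder is
# filled from the module-level `file_types` dict, and the double braces
# collapse to single braces after `str.format`.
#
#     fstr = madrigal_file_format_str(30)
#     fstr = fstr.format(file_type=file_types['hdf5'])
#     # fstr -> 'mlh{year:02d}{month:02d}{day:02d}?.{version:03d}.hdf5'

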
def sort_file_formats(fnames):
    """Separate filenames by file format type.

    Parameters
    ----------
    fnames : array-like
        Iterable of filename strings, full path, to data files to be loaded.
        This input is nominally provided by pysat itself.

    Returns
    -------
    load_file_types : dict
        A dictionary with file types as keys and a list of filenames for
        each file type.

    """
    # Sort the files by file format type
    load_file_types = {ftype: [] for ftype in file_types.keys()}
    for fname in fnames:
        for ftype in file_types.keys():
            if fname.find(ftype) > 0:
                load_file_types[ftype].append(fname)
                break

        if fname not in load_file_types[ftype]:
            # Raise a logger warning if a file with an unknown extension
            # is encountered
            pysat.logger.warning(
                'file with unknown file type: {:}'.format(fname))

    return load_file_types


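# Usage sketch with hypothetical paths: filenames are binned by the
# file-type substring appearing in their names.
#
#     sorted_files = sort_file_formats(['/data/mlh210101a.001.hdf5',
#                                       '/data/mlh210102a.001.netCDF4'])
#     # sorted_files['hdf5']    -> ['/data/mlh210101a.001.hdf5']
#     # sorted_files['netCDF4'] -> ['/data/mlh210102a.001.netCDF4']
#     # sorted_files['simple']  -> []

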
def update_meta_with_hdf5(file_ptr, meta):
    """Get metadata from a Madrigal HDF5 file.

    Parameters
    ----------
    file_ptr : h5py._hl.files.File
        Pointer to an open HDF5 file
    meta : pysat.Meta
        Existing Meta class to be updated

    Returns
    -------
    file_labels : list
        List of metadata available

    """
    # Get the Madrigal metadata
    file_meta = file_ptr['Metadata']['Data Parameters']
    file_labels = list()

    # Load available info into pysat.Meta if not already present
    for item in file_meta:
        name_string = item[0].decode('UTF-8')
        unit_string = item[3].decode('UTF-8')
        desc_string = item[1].decode('UTF-8')
        file_labels.append(name_string)

        # Only update metadata if necessary
        if name_string.lower() not in meta:
            meta[name_string.lower()] = {meta.labels.name: name_string,
                                         meta.labels.units: unit_string,
                                         meta.labels.desc: desc_string}

    # Add additional metadata notes. Custom attributes attached to meta are
    # attached to the MetaHeader object later
    for key in file_ptr['Metadata']:
        if key != 'Data Parameters':
            setattr(meta, key.replace(' ', '_'),
                    file_ptr['Metadata'][key][:])

    return file_labels


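# Usage sketch (hypothetical local file): harvest variable names, units, and
# descriptions from an open Madrigal HDF5 file into a pysat.Meta object.
#
#     meta = pysat.Meta()
#     with h5py.File('/data/mlh210101a.001.hdf5', 'r') as fptr:
#         file_labels = update_meta_with_hdf5(fptr, meta)
#     # meta now has name/units/desc entries for each label in file_labels

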
def build_madrigal_datetime_index(mad_data):
    """Create a datetime index using standard Madrigal variables.

    Parameters
    ----------
    mad_data : pds.DataFrame
        Madrigal data, expects time variables 'year', 'month', 'day',
        'hour', 'min', and 'sec'

    Returns
    -------
    data_time : pds.DatetimeIndex
        Datetime index for use by pysat

    Raises
    ------
    ValueError
        If expected time variables are missing

    """
    # Set the standard time keys
    time_keys = np.array(['year', 'month', 'day', 'hour', 'min', 'sec'])

    # Construct datetime index from times
    if not np.all([key in mad_data.columns for key in time_keys]):
        time_keys = [key for key in time_keys
                     if key not in mad_data.columns]
        raise ValueError(' '.join(["unable to construct time index, missing",
                                   repr(time_keys)]))

    # Get the UT seconds of day and build the datetime index
    uts = 3600.0 * mad_data.loc[:, 'hour'] + 60.0 * mad_data.loc[:, 'min'] \
        + mad_data.loc[:, 'sec']
    data_time = pysat.utils.time.create_datetime_index(
        year=mad_data.loc[:, 'year'], month=mad_data.loc[:, 'month'],
        day=mad_data.loc[:, 'day'], uts=uts)

    return data_time


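# Usage sketch: build a pysat-ready time index from the standard Madrigal
# time columns of a hypothetical DataFrame.
#
#     mad_data = pds.DataFrame({'year': [2021], 'month': [1], 'day': [1],
#                               'hour': [12], 'min': [30], 'sec': [0]})
#     data_time = build_madrigal_datetime_index(mad_data)
#     # data_time[0] -> Timestamp('2021-01-01 12:30:00')

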
def convert_pandas_to_xarray(xarray_coords, data, time_ind):
    """Convert Madrigal HDF5/simple data from pandas to xarray.

    Parameters
    ----------
    xarray_coords : list or NoneType
        List of keywords to use as coordinates if xarray output is desired
        instead of a pandas DataFrame.  Can build an xarray Dataset that has
        different coordinate dimensions by providing a dict inside the list
        instead of coordinate variable name strings.  Each dict will have a
        tuple of coordinates as the key and a list of variable strings as
        the value.  Empty list if None.  For example,
        xarray_coords=[{('time',): ['year', 'doy'],
                        ('time', 'gdalt'): ['data1', 'data2']}].
        (default=None)
    data : pds.DataFrame
        Data to be converted into the xarray format
    time_ind : pds.DatetimeIndex or NoneType
        Time index for the data or None for no time index

    Returns
    -------
    data : xr.Dataset
        Data in the dataset format.

    """
    # If a list was provided, recast as a dict and grab the data columns
    if not isinstance(xarray_coords, dict):
        xarray_coords = {tuple(xarray_coords):
                         [col for col in data.columns
                          if col not in xarray_coords]}

    # Determine the order in which the keys should be processed:
    # Greatest to least number of dimensions
    len_dict = {len(xcoords): xcoords for xcoords in xarray_coords.keys()}
    coord_order = [len_dict[xkey] for xkey in sorted(
        [lkey for lkey in len_dict.keys()], reverse=True)]

    # Append time to the data frame, if provided
    if time_ind is not None:
        data = data.assign(time=pds.Series(time_ind, index=data.index))

    # Cycle through each of the coordinate dimensions
    xdatasets = list()
    for xcoords in coord_order:
        if data.empty:
            break
        elif not np.all([xkey.lower() in data.columns for xkey in xcoords]):
            raise ValueError(''.join(['unknown coordinate key in [',
                                      repr(xcoords), '], use only: ',
                                      repr(data.columns)]))
        elif not np.all([xkey.lower() in data.columns
                         for xkey in xarray_coords[xcoords]]):
            good_ind = [i for i, xkey in enumerate(xarray_coords[xcoords])
                        if xkey.lower() in data.columns]

            if len(good_ind) == 0:
                raise ValueError('All data variables {:} are unknown.'.format(
                    xarray_coords[xcoords]))
            elif len(good_ind) < len(xarray_coords[xcoords]):
                # Remove the coordinates that aren't present.
                temp = np.array(xarray_coords[xcoords])[good_ind]

                # Warn the user, some of this may be due to a file format
                # change.
                bad_ind = [i for i in range(len(xarray_coords[xcoords]))
                           if i not in good_ind]
                pysat.logger.warning(
                    'unknown data variable(s) {:}, using only: {:}'.format(
                        np.array(xarray_coords[xcoords])[bad_ind], temp))

                # Assign the good data as a list.
                xarray_coords[xcoords] = list(temp)

        # Select the desired data values
        sel_data = data[list(xcoords) + xarray_coords[xcoords]]

        # Remove duplicates before indexing, to ensure data with the same
        # values at different locations are kept
        sel_data = sel_data.drop_duplicates()

        # Set the indices
        sel_data = sel_data.set_index(list(xcoords))

        # Recast as an xarray
        xdatasets.append(sel_data.to_xarray())

    # Get the necessary information to test the data
    lcols = data.columns
    len_data = len(lcols)

    # Merge all of the datasets
    if len(xdatasets) > 0:
        data = xr.merge(xdatasets)
        test_variables = [xkey for xkey in data.variables.keys()]
        ltest = len(test_variables)

        # Test to see that all data was retrieved
        if ltest != len_data:
            if ltest < len_data:
                estr = 'missing: {:}'.format(' '.join([
                    dvar for dvar in lcols if dvar not in test_variables]))
            else:
                estr = 'have extra: {:}'.format(' '.join([
                    tvar for tvar in test_variables if tvar not in lcols]))
            raise ValueError(''.join([
                'coordinates not supplied for all data columns',
                ': {:d} != {:d}; '.format(ltest, len_data), estr]))
    else:
        # Return an empty object
        data = xr.Dataset()

    return data


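# Usage sketch: recast a hypothetical altitude-profile frame along the
# ('time', 'gdalt') coordinates.  Passing a plain list of coordinate names
# treats every other column as a data variable.
#
#     df = pds.DataFrame({'gdalt': [100.0, 200.0], 'ne': [1.0e11, 2.0e11]})
#     time_ind = pds.DatetimeIndex([dt.datetime(2021, 1, 1)] * 2)
#     ds = convert_pandas_to_xarray(['time', 'gdalt'], df, time_ind)
#     # ds['ne'] is now dimensioned along ('time', 'gdalt')

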
def load(fnames, tag='', inst_id='', xarray_coords=None):
    """Load data from Madrigal into pandas or xarray.

    Parameters
    ----------
    fnames : array-like
        Iterable of filename strings, full path, to data files to be loaded.
        This input is nominally provided by pysat itself.
    tag : str
        Tag name used to identify particular data set to be loaded.  This
        input is nominally provided by pysat itself and is not used here.
        (default='')
    inst_id : str
        Instrument ID used to identify particular data set to be loaded.
        This input is nominally provided by pysat itself, and is not used
        here. (default='')
    xarray_coords : list or NoneType
        List of keywords to use as coordinates if xarray output is desired
        instead of a pandas DataFrame.  Can build an xarray Dataset that has
        different coordinate dimensions by providing a dict inside the list
        instead of coordinate variable name strings.  Each dict will have a
        tuple of coordinates as the key and a list of variable strings as
        the value.  Empty list if None.  For example,
        xarray_coords=[{('time',): ['year', 'doy'],
                        ('time', 'gdalt'): ['data1', 'data2']}].
        (default=None)

    Returns
    -------
    data : pds.DataFrame or xr.Dataset
        A pandas DataFrame or xarray Dataset holding the data from the file
    meta : pysat.Meta
        Metadata from the file, as well as default values from pysat

    Raises
    ------
    ValueError
        If data columns expected to create the time index are missing or if
        coordinates are not supplied for all data columns.

    Note
    ----
    Currently HDF5 reading breaks if a different file type was used
    previously.

    This routine is called as needed by pysat.  It is not intended for
    direct user interaction.

    """
    # Test and sort the file formats
    load_file_types = sort_file_formats(fnames)

    # Initialize the xarray coordinates, if needed
    if xarray_coords is None:
        xarray_coords = []

    # Initialize the output
    meta = pysat.Meta()
    labels = []
    data = None

    # Load the file data for netCDF4 files
    if len(load_file_types["netCDF4"]) == 1:
        # Xarray natively opens netCDF data into a Dataset
        file_data = xr.open_dataset(load_file_types["netCDF4"][0],
                                    engine="netcdf4")
    elif len(load_file_types["netCDF4"]) > 1:
        file_data = xr.open_mfdataset(load_file_types["netCDF4"],
                                      combine='by_coords', engine="netcdf4")

    if len(load_file_types["netCDF4"]) > 0:
        # Currently not saving file header data, as all metadata is at the
        # data variable level.  The attributes are only saved if they occur
        # in all of the loaded files.
        if 'catalog_text' in file_data.attrs:
            notes = file_data.attrs['catalog_text']
        else:
            notes = "No catalog text"

        # Get the coordinate and data variable names
        meta_items = [dkey for dkey in file_data.data_vars.keys()]
        meta_items.extend([dkey for dkey in file_data.coords.keys()])

        for item in meta_items:
            # Set the meta values for the expected labels
            meta_dict = {meta.labels.name: item,
                         meta.labels.fill_val: np.nan,
                         meta.labels.notes: notes}

            for key, label in [('units', meta.labels.units),
                               ('description', meta.labels.desc)]:
                if key in file_data[item].attrs.keys():
                    meta_dict[label] = file_data[item].attrs[key]
                else:
                    meta_dict[label] = ''

            meta[item.lower()] = meta_dict

            # Remove any metadata from xarray
            file_data[item].attrs = {}

        # Reset the UNIX timestamp as datetime and set it as an index
        file_data = file_data.rename({'timestamps': 'time'})
        time_data = pds.to_datetime(file_data['time'].values, unit='s')
        data = file_data.assign_coords({'time': ('time', time_data)})

    # Load the file data for HDF5 files
    if len(load_file_types["hdf5"]) > 0 or len(load_file_types["simple"]) > 0:
        # Ensure we don't try to create an xarray object with only time as
        # the coordinate
        coord_len = len(xarray_coords)
        if 'time' in xarray_coords:
            coord_len -= 1

        # Cycle through all the filenames
        fdata = []
        fnames = list(load_file_types["hdf5"])
        fnames.extend(load_file_types["simple"])
        for fname in fnames:
            # Open the specified file
            if fname in load_file_types["simple"]:
                # Get the gzipped text data
                with gzip.open(fname, 'rb') as fin:
                    file_data = fin.readlines()

                # Load available info into pysat.Meta if this is the first
                # file
                header = [item.decode('UTF-8')
                          for item in file_data.pop(0).split()]
                if len(labels) == 0:
                    for item in header:
                        labels.append(item)

                        # Only update metadata if necessary
                        if item.lower() not in meta:
                            meta[item.lower()] = {meta.labels.name: item}

                # Construct a dict of the output
                file_dict = {item.lower(): list() for item in header}
                for line in file_data:
                    for i, val in enumerate(line.split()):
                        file_dict[header[i].lower()].append(float(val))

                # Load data into frame, with labels from metadata
                ldata = pds.DataFrame.from_dict(file_dict)
            else:
                # Open the specified file and get the data and metadata
                filed = h5py.File(fname, 'r')
                file_data = filed['Data']['Table Layout']
                new_labels = update_meta_with_hdf5(filed, meta)

                if len(labels) == 0:
                    labels = new_labels

                # Load data into frame, with labels from metadata
                ldata = pds.DataFrame.from_records(file_data, columns=labels)

            # Enforce lowercase variable names
            ldata.columns = [item.lower() for item in ldata.columns]

            # Extended processing is the same for simple and HDF5 files
            #
            # Construct datetime index from times
            time = build_madrigal_datetime_index(ldata)

            # Declare index or recast as xarray
            if coord_len > 0:
                ldata = convert_pandas_to_xarray(xarray_coords, ldata, time)
            else:
                # Set the index to time
                ldata.index = time

                # Raise a logging warning if there are duplicate times.  This
                # means the data should be stored as an xarray Dataset
                if np.any(time.duplicated()):
                    pysat.logger.warning(''.join([
                        "duplicated time indices, consider specifying ",
                        "additional coordinates and storing the data as an ",
                        "xarray Dataset"]))

            # Compile a list of the data objects
            fdata.append(ldata)

        # Merge the data together, accounting for potential netCDF output
        if data is None and len(fdata) == 1:
            data = fdata[0]
        else:
            if coord_len > 0:
                if data is None:
                    data = xr.merge(fdata)
                else:
                    data = xr.combine_by_coords([data, xr.merge(fdata)])
            else:
                if data is None:
                    data = pds.concat(fdata)
                    data = data.sort_index()
                else:
                    ldata = pds.concat(fdata).sort_index().to_xarray()
                    ldata = ldata.rename({'index': 'time'})
                    data = xr.combine_by_coords([data, ldata]).to_pandas()

    # Ensure that data is at least an empty Dataset
    if data is None:
        if len(xarray_coords) > 0:
            data = xr.Dataset()
        else:
            data = pds.DataFrame(dtype=np.float64)

    return data, meta


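# Usage sketch (hypothetical file list): pysat calls this routine
# internally, but it can be exercised directly for debugging.
#
#     data, meta = load(['/data/mlh210101a.001.hdf5'],
#                       xarray_coords=['time', 'gdalt'])
#     # data -> xr.Dataset with 'time' and 'gdalt' dimensions

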
def download(date_array, inst_code=None, kindat=None, data_path=None,
             user=None, password=None, url="http://cedar.openmadrigal.org",
             file_type='hdf5', **kwargs):
    """Download data from Madrigal.

    Parameters
    ----------
    date_array : array-like
        List of datetimes to download data for.  The sequence of dates need
        not be contiguous.
    inst_code : str
        Madrigal instrument code(s), cast as a string.  If multiple are
        used, separate them with commas. (default=None)
    kindat : str
        Experiment instrument code(s), cast as a string.  If multiple are
        used, separate them with commas. (default=None)
    data_path : str
        Path to directory to download data to. (default=None)
    user : str
        User string input used for download.  Provided by user and passed
        via pysat.  If an account is required for downloads this routine
        here must error if user not supplied. (default=None)
    password : str
        Password for data download. (default=None)
    url : str
        URL for Madrigal site (default='http://cedar.openmadrigal.org')
    file_type : str
        File format for Madrigal data.  Load routines currently only accept
        'hdf5' and 'netCDF4', but any of the Madrigal options may be used
        here. (default='hdf5')
    **kwargs : dict
        Additional kwarg catch, allows general use when tag/inst_id are not
        needed for a given instrument.

    Raises
    ------
    ValueError
        If the specified input type or Madrigal experiment codes are unknown

    Note
    ----
    The user's names should be provided in field user.  Ruby Payne-Scott
    should be entered as Ruby+Payne-Scott

    The password field should be the user's email address.  These parameters
    are passed to Madrigal when downloading.

    The affiliation field is set to pysat to enable tracking of pysat
    downloads.

    """
    if file_type not in file_types.keys():
        raise ValueError("Unknown file format {:}, accepts {:}".format(
            file_type, file_types.keys()))

    _check_madrigal_params(inst_code=inst_code, user=user, password=password)

    if kindat is None:
        raise ValueError("Must supply Madrigal experiment code")

    # Get the list of desired remote files
    start = date_array.min()
    stop = date_array.max()
    if start == stop:
        stop = date_array.shift().max()

    # Initialize the connection to Madrigal
    pysat.logger.info('Connecting to Madrigal')
    web_data = madrigalWeb.MadrigalData(url)
    pysat.logger.info('Connection established.')

    files = get_remote_filenames(inst_code=inst_code, kindat=kindat,
                                 user=user, password=password,
                                 web_data=web_data, url=url,
                                 start=start, stop=stop)

    for mad_file in files:
        # Build the local filename
        local_file = os.path.join(data_path,
                                  os.path.basename(mad_file.name))
        if local_file.find(file_type) <= 0:
            split_file = local_file.split(".")
            split_file[-1] = file_type
            local_file = ".".join(split_file)

        if not os.path.isfile(local_file):
            fstr = ''.join(('Downloading data for ', local_file))
            pysat.logger.info(fstr)
            web_data.downloadFile(mad_file.name, local_file, user, password,
                                  "pysat", format=file_type)
        else:
            estr = ''.join((local_file, ' already exists. Skipping.'))
            pysat.logger.info(estr)

    return


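# Usage sketch (hypothetical local path and kindat value): download a short
# span of Millstone Hill ISR data, following the Madrigal 'Rules of the
# Road' conventions for `user` and `password` described above.
#
#     date_array = pds.date_range(dt.datetime(2021, 1, 1),
#                                 dt.datetime(2021, 1, 2))
#     download(date_array, inst_code='30', kindat='3410',
#              data_path='/tmp/madrigal', user='Ruby+Payne-Scott',
#              password='ruby.payne-scott@email.address')

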
def get_remote_filenames(inst_code=None, kindat='', user=None,
                         password=None, web_data=None,
                         url="http://cedar.openmadrigal.org",
                         start=dt.datetime(1900, 1, 1),
                         stop=dt.datetime.utcnow(), date_array=None):
    """Retrieve the remote filenames for a specified Madrigal experiment.

    Parameters
    ----------
    inst_code : str or NoneType
        Madrigal instrument code(s), cast as a string.  If multiple are
        used, separate them with commas. (default=None)
    kindat : str
        Madrigal experiment code(s), cast as a string.  If multiple are
        used, separate them with commas.  If not supplied, all will be
        returned. (default='')
    user : str or NoneType
        User string input used for download.  Provided by user and passed
        via pysat.  If an account is required for downloads this routine
        here must error if user not supplied. (default=None)
    password : str or NoneType
        Password for data download. (default=None)
    web_data : MadrigalData or NoneType
        Open connection to Madrigal database or None (will initiate using
        url) (default=None)
    url : str
        URL for Madrigal site (default='http://cedar.openmadrigal.org')
    start : dt.datetime or NoneType
        Starting time for file list, None reverts to default
        (default=dt.datetime(1900, 1, 1))
    stop : dt.datetime or NoneType
        Ending time for the file list, None reverts to default
        (default=dt.datetime.utcnow())
    date_array : array-like or NoneType
        Array of datetimes to download data for.  The sequence of dates need
        not be contiguous and will be used instead of start and stop if
        supplied. (default=None)

    Returns
    -------
    files : list
        List of remote experiment file objects
        (madrigalWeb.madrigalWeb.MadrigalExperimentFile)

    Raises
    ------
    ValueError
        If unexpected date_array input is supplied

    Note
    ----
    The user's names should be provided in field user.  Ruby Payne-Scott
    should be entered as Ruby+Payne-Scott

    The password field should be the user's email address.  These parameters
    are passed to Madrigal when downloading.

    The affiliation field is set to pysat to enable tracking of pysat
    downloads.

    """
    _check_madrigal_params(inst_code=inst_code, user=user, password=password)

    if kindat in ['', '*']:
        kindat = []
    else:
        kindat = [int(kk) for kk in kindat.split(",")]

    # If date_array is supplied, overwrite start and stop
    if date_array is not None:
        if len(date_array) == 0:
            raise ValueError('unknown date_array supplied: {:}'.format(
                date_array))
        start = date_array.min()
        stop = date_array.max()

    # If NoneType was supplied for start or stop, set to defaults
    if start is None:
        start = dt.datetime(1900, 1, 1)

    if stop is None:
        stop = dt.datetime.utcnow()

    # If start and stop are identical, increment
    if start == stop:
        stop += dt.timedelta(days=1)

    # Open a connection to Madrigal
    if web_data is None:
        web_data = madrigalWeb.MadrigalData(url)

    # Get the list of experiments for the instrument in the desired range
    exp_list = web_data.getExperiments(inst_code, start.year, start.month,
                                       start.day, start.hour, start.minute,
                                       start.second, stop.year, stop.month,
                                       stop.day, stop.hour, stop.minute,
                                       stop.second)

    # Iterate over the experiments to grab the files for each one
    files = list()
    istr = "Found {:d} Madrigal experiments between {:s} and {:s}".format(
        len(exp_list), start.strftime('%d %B %Y'), stop.strftime('%d %B %Y'))
    pysat.logger.info(istr)
    for exp in exp_list:
        if good_exp(exp, date_array=date_array):
            file_list = web_data.getExperimentFiles(exp.id)

            if len(kindat) == 0:
                files.extend(file_list)
            else:
                for file_obj in file_list:
                    if file_obj.kindat in kindat:
                        files.append(file_obj)

    return files


def good_exp(exp, date_array=None):
    """Determine if a Madrigal experiment has good data for specified dates.

    Parameters
    ----------
    exp : MadrigalExperimentFile
        MadrigalExperimentFile object
    date_array : list-like or NoneType
        List of datetimes to download data for.  The sequence of dates need
        not be contiguous.  If None, then any valid experiment will be
        assumed to be valid. (default=None)

    Returns
    -------
    gflag : boolean
        True if good, False if bad

    """
    gflag = False

    if exp.id != -1:
        if date_array is None:
            gflag = True
        else:
            exp_start = dt.date(exp.startyear, exp.startmonth, exp.startday)
            exp_end = (dt.date(exp.endyear, exp.endmonth, exp.endday)
                       + dt.timedelta(days=1))

            for date_val in date_array:
                if date_val.date() >= exp_start \
                        and date_val.date() <= exp_end:
                    gflag = True
                    break

    return gflag


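# Usage sketch: screen the experiments returned by a Madrigal query before
# requesting their (potentially large) file lists.  `exp_list` and
# `date_array` are assumed to exist as in `get_remote_filenames` above.
#
#     valid_exps = [exp for exp in exp_list if good_exp(exp, date_array)]

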
def list_remote_files(tag, inst_id, inst_code=None, kindats=None, user=None,
                      password=None, supported_tags=None,
                      url="http://cedar.openmadrigal.org",
                      two_digit_year_break=None,
                      start=dt.datetime(1900, 1, 1),
                      stop=dt.datetime.utcnow()):
    """List files available from Madrigal.

    Parameters
    ----------
    tag : str
        Denotes type of file to load.  Accepts strings corresponding to the
        appropriate Madrigal Instrument `tags`.
    inst_id : str
        Specifies the instrument ID to load.  Accepts strings corresponding
        to the appropriate Madrigal Instrument `inst_ids`.
    inst_code : str or NoneType
        Madrigal instrument code(s), cast as a string.  If multiple are
        used, separate them with commas. (default=None)
    kindats : dict
        Madrigal experiment codes, in a dict of dicts with inst_ids as top
        level keys and tags as second level keys with Madrigal experiment
        code(s) as values.  These should be strings, with multiple codes
        separated by commas. (default=None)
    user : str or NoneType
        User string input used for download.  Provided by user and passed
        via pysat.  If an account is required for downloads this routine
        here must error if user not supplied. (default=None)
    password : str or NoneType
        Password for data download. (default=None)
    supported_tags : dict or NoneType
        Keys are inst_id, each containing a dict keyed by tag where the
        values are file format template strings. (default=None)
    url : str
        URL for Madrigal site (default='http://cedar.openmadrigal.org')
    two_digit_year_break : int or NoneType
        If filenames only store two digits for the year, then '1900' will be
        added for years >= two_digit_year_break and '2000' will be added for
        years < two_digit_year_break. (default=None)
    start : dt.datetime
        Starting time for file list. (default=01-01-1900)
    stop : dt.datetime
        Ending time for the file list. (default=time of run)

    Returns
    -------
    pds.Series
        A series of filenames, see
        `pysat.utils.files.process_parsed_filenames` for more information.

    Raises
    ------
    ValueError
        For missing kwarg input
    KeyError
        For dictionary input missing requested tag/inst_id

    Note
    ----
    The user's names should be provided in field user.  Ruby Payne-Scott
    should be entered as Ruby+Payne-Scott

    The password field should be the user's email address.  These parameters
    are passed to Madrigal when downloading.

    The affiliation field is set to pysat to enable tracking of pysat
    downloads.

    Examples
    --------
    This method is intended to be set in an instrument support file at the
    top level using functools.partial
    ::

        list_remote_files = functools.partial(mad_meth.list_remote_files,
                                              supported_tags=supported_tags,
                                              inst_code=madrigal_inst_code,
                                              kindats=madrigal_tag)

    """
    _check_madrigal_params(inst_code=inst_code, user=user, password=password)

    # Test the input
    if supported_tags is None or kindats is None:
        raise ValueError('Must supply supported_tags and kindats dicts')

    # Raise a KeyError if the input dictionaries don't match the input
    format_str = supported_tags[inst_id][tag]
    kindat = kindats[inst_id][tag]

    # Retrieve the remote file experiment list
    files = get_remote_filenames(inst_code=inst_code, kindat=kindat,
                                 user=user, password=password, url=url,
                                 start=start, stop=stop)

    filenames = [os.path.basename(file_exp.name) for file_exp in files]

    # Madrigal uses 'h5' for some experiments and 'hdf5' for others
    format_ext = os.path.splitext(format_str)[-1]
    if len(filenames) > 0 and format_ext == '.hdf5':
        file_ext = os.path.splitext(filenames[-1])[-1]
        if file_ext == '.h5':
            format_str = format_str.replace('.hdf5', '.h5')

    # Parse these filenames to grab out the ones we want
    pysat.logger.info("Parsing filenames")
    if format_str.find('*') < 0:
        stored = pysat.utils.files.parse_fixed_width_filenames(filenames,
                                                               format_str)
    else:
        stored = pysat.utils.files.parse_delimited_filenames(filenames,
                                                             format_str, '.')

    # Process the parsed filenames and return a properly formatted Series
    pysat.logger.info("Processing filenames")
    return pysat.utils.files.process_parsed_filenames(stored,
                                                      two_digit_year_break)


def list_files(tag, inst_id, data_path, format_str=None,
               supported_tags=None, file_cadence=dt.timedelta(days=1),
               two_digit_year_break=None, delimiter=None, file_type=None):
    """Create a pandas Series of every file for chosen Instrument data.

    Parameters
    ----------
    tag : str
        Denotes type of file to load.  Accepts strings corresponding to the
        appropriate Madrigal Instrument `tags`.
    inst_id : str
        Specifies the instrument ID to load.  Accepts strings corresponding
        to the appropriate Madrigal Instrument `inst_ids`.
    data_path : str
        Path to data directory.
    format_str : str or NoneType
        User specified file format.  If None is specified, the default
        formats associated with the supplied tags are used. (default=None)
    supported_tags : dict or NoneType
        Keys are inst_id, each containing a dict keyed by tag where the
        values are file format template strings. (default=None)
    file_cadence : dt.timedelta or pds.DateOffset
        pysat assumes a daily file cadence, but some instrument data files
        contain longer periods of time.  This parameter allows the
        specification of regular file cadences greater than or equal to a
        day (e.g., weekly, monthly, or yearly).
        (default=dt.timedelta(days=1))
    two_digit_year_break : int or NoneType
        If filenames only store two digits for the year, then '1900' will be
        added for years >= two_digit_year_break and '2000' will be added for
        years < two_digit_year_break.  If None, then four-digit years are
        assumed. (default=None)
    delimiter : str or NoneType
        Delimiter string upon which files will be split (e.g., '.').  If
        None, filenames will be parsed presuming a fixed width format.
        (default=None)
    file_type : str or NoneType
        File format for Madrigal data.  Load routines currently accept
        'hdf5', 'simple', and 'netCDF4', but any of the Madrigal options may
        be used here.  If None, will look for all known file types.
        (default=None)

    Returns
    -------
    out : pds.Series
        A pandas Series containing the verified available files

    """
    # Initialize the transitional variables
    list_file_types = file_types.keys() if file_type is None else [file_type]
    sup_tags = {inst_id: {tag: supported_tags[inst_id][tag]}}
    out_series = list()

    # Cycle through each requested file type, loading the requested files
    for ftype in list_file_types:
        if supported_tags[inst_id][tag].find('{file_type}') > 0:
            sup_tags[inst_id][tag] = supported_tags[inst_id][tag].format(
                file_type=file_types[ftype])

        out_series.append(pysat.instruments.methods.general.list_files(
            tag=tag, inst_id=inst_id, data_path=data_path,
            format_str=format_str, supported_tags=sup_tags,
            file_cadence=file_cadence,
            two_digit_year_break=two_digit_year_break, delimiter=delimiter))

    # Combine the file lists, ensuring the files are correctly ordered
    if len(out_series) == 1:
        out = out_series[0]
    else:
        out = pds.concat(out_series).sort_index()

    return out


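# Usage sketch: instrument support modules typically wire this function in
# with functools.partial, supplying their own `supported_tags` dict, e.g.
#
#     list_files = functools.partial(general.list_files,
#                                    supported_tags=supported_tags)

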
def filter_data_single_date(inst):
    """Filter data to a single date.

    Parameters
    ----------
    inst : pysat.Instrument
        Instrument object to which this routine should be attached

    Note
    ----
    Madrigal serves multiple days within a single JRO file.  To counter
    this, we will filter each loaded day so that it only contains the
    relevant day of data.  This is only applied if loading by date.  It is
    not applied when supplying pysat with a specific filename to load, nor
    when data padding is enabled.  Note that when data padding is enabled
    the final data available within the instrument will be downselected by
    pysat to only include the date specified.

    Examples
    --------
    This routine is intended to be added to the Instrument nanokernel
    processing queue via
    ::

        inst = pysat.Instrument()
        inst.custom_attach(filter_data_single_date)

    This function will then be automatically applied to the Instrument
    object data on every load by the pysat nanokernel.

    Warnings
    --------
    For the best performance, this function should be added first in the
    queue.  This may be ensured by setting the default function in a pysat
    instrument file to this one.  To do this, within platform_name.py set
    `preprocess` at the top level.
    ::

        preprocess = pysat.instruments.methods.madrigal.filter_data_single_date

    """
    # Only do this if loading by date!
    if inst._load_by_date and inst.pad is None:
        # Identify the times for the loaded date
        idx, = np.where(
            (inst.index >= inst.date)
            & (inst.index < (inst.date + pds.DateOffset(days=1))))

        # Downselect from all data
        inst.data = inst[idx]

    return


def _check_madrigal_params(inst_code, user, password):
    """Check that parameters required by the Madrigal database are passed.

    Parameters
    ----------
    inst_code : str or NoneType
        Madrigal instrument code(s), cast as a string.  If multiple are
        used, separate them with commas.
    user : str or NoneType
        The user's names should be provided in field user.  Ruby Payne-Scott
        should be entered as Ruby+Payne-Scott
    password : str or NoneType
        The password field should be the user's email address.  These
        parameters are passed to Madrigal when downloading.

    Raises
    ------
    ValueError
        Default values of None will raise an error.

    """
    inst_codes = known_madrigal_inst_codes(None)

    if str(inst_code) not in inst_codes.keys():
        raise ValueError(''.join(["Unknown Madrigal instrument code: ",
                                  repr(inst_code), ". If this is a valid ",
                                  "Madrigal instrument code, please update ",
                                  "`pysatMadrigal.instruments.methods.",
                                  "general.known_madrigal_inst_codes`."]))

    if not (isinstance(user, str) and isinstance(password, str)):
        raise ValueError(' '.join(("The Madrigal database requires a",
                                   "username and password.  Please input",
                                   "these as user='firstname lastname' and",
                                   "password='myname@email.address' in this",
                                   "function.")))

    return