Files
MachineLearningNotebooks/how-to-use-azureml/work-with-data/datadrift-tutorial/get_data.py

31 lines
1.1 KiB
Python

# import packages
import os
import pandas as pd
from calendar import monthrange
from datetime import datetime, timedelta
from azureml.core import Dataset, Datastore, Workspace
from azureml.opendatasets import NoaaIsdWeather
# get workspace and datastore
ws = Workspace.from_config()
dstore = ws.get_default_datastore()
# adjust parameters as needed
target_years = list(range(2010, 2020))
start_month = 1
# get data
for year in target_years:
for month in range(start_month, 12 + 1):
path = 'weather-data/{}/{:02d}/'.format(year, month)
try:
start = datetime(year, month, 1)
end = datetime(year, month, monthrange(year, month)[1]) + timedelta(days=1)
isd = NoaaIsdWeather(start, end).to_pandas_dataframe()
isd = isd[isd['stationName'].str.contains('FLORIDA', regex=True, na=False)]
os.makedirs(path, exist_ok=True)
isd.to_parquet(path + 'data.parquet')
except Exception as e:
print('Month {} in year {} likely has no data.\n'.format(month, year))
print('Exception: {}'.format(e))