From d3e1d2b6544d92a9ee4915b49cdf3e684099d2b5 Mon Sep 17 00:00:00 2001 From: Boya Yu Date: Wed, 20 Sep 2017 11:31:05 -0400 Subject: [PATCH 1/2] Fix critical bug in weather scraper --- bpeng/weather/weather.py | 295 ++++++++++----------------------------- 1 file changed, 70 insertions(+), 225 deletions(-) diff --git a/bpeng/weather/weather.py b/bpeng/weather/weather.py index a7e0994..78008ce 100644 --- a/bpeng/weather/weather.py +++ b/bpeng/weather/weather.py @@ -1,7 +1,8 @@ """ Weather scraper for wunderground.com """ -from datetime import date, datetime, timedelta +from datetime import date, timedelta from urllib.request import urlopen +import json import numpy as np import pandas as pd @@ -10,9 +11,7 @@ class WeatherUnderground: """ Class for parsing weather data from 'wunderground.com' This version only for years after 2000 - Args: - period (str): year | period, optional, default 'year' If 'year', get the weather data for one single year If 'period', get the weather data within an particular period @@ -20,36 +19,32 @@ class WeatherUnderground: startdate (str): Startdate of which data is required in format year/month/day. Ignored when period = 'year' enddate (str): Enddate of which data is required. Ignored when period = 'year'. - location (str): Location or airport where data is required - i.e. KCHA for Chattanooga, TN - KNYC for New York location, NY + city (str): City where data is required. Default is 'New_York' + state (str): State where data is required. Default is 'NY' Attributes: - day_list (list): A list of all the dates (str) within the period - Examples: - - >>> wp = WeatherUnderground(period='period',startdate='2016/3/1', - enddate='2016/3/3', location='KNYC') + >>> wp = WeatherUnderground(period='period', startdate='2017/8/1', + enddate='2017/8/2') >>> wp.get_weather() >>> print(wp.output_temperature()) date temperature - 0 2016/03/01 45.63 - 1 2016/03/02 41.95 - 2 2016/03/03 31.00 - - >>> wp = WeatherUnderground(period='period',startdate='2014/1/1', - enddate='2014/1/2', location='KCHA') - >>> wp.get_weather(detail=True) - >>> print(wp.output_detail()) + 0 2017/08/01 80.59 + 1 2017/08/02 75.85 + >>> wp = WeatherUnderground(period='period', startdate='2014/1/1', + enddate='2014/1/2', city='Chattanooga', state='TN') + >>> wp.get_weather() + >>> print(wp.output_daily_data()) temperature humidity wind rain 2014/01/01 36.68 79.0 1.790000 0.00 2014/01/02 38.63 79.0 10.028205 0.43 """ # pylint: disable=too-many-instance-attributes,too-many-arguments - WUNDER_URL = 'https://www.wunderground.com/history/airport/{}/{}/DailyHistory.html?format=1' + # WUNDER_URL = 'https://www.wunderground.com/history/airport/{}/{}/DailyHistory.html?format=1' + # url above is deprecated + WUNDER_JSON = 'http://api.wunderground.com/api/cfc5106fcb7366d8/history_{}/q/{}/{}.json' MIN_ALLOWED_TEMPERATURE = -100 # The minimum allowed value for temperature temperature = None humidity = None @@ -65,10 +60,17 @@ class WeatherUnderground: year=2014, startdate='sd', enddate='ed', - location='KNYC' + city='New_York', + state='NY' ): self.year = year - self.location = location + self.city = city + self.state = state + + self.temperature = [] + self.humidity = [] + self.wind = [] + self.rain = [] if period == 'year': start_day = date(year, 1, 1) @@ -94,240 +96,93 @@ class WeatherUnderground: def get_date_weather(self, day): """ Get the weather for a given date - Args: - day (string): The date to get weather for - Returns: - - list: A 2D list where each entry is a list of features for a timestamp + pd.DataFrame: a dataframe with weather data """ - url = self.WUNDER_URL.format(self.location, day) - html = urlopen(url).read().decode('utf-8') - # Seperate html by new lines, each entry is a timestamp with data - html_lines = html.split('\n')[2:-1] - # 2D list where each entry is a list of features for a given timestamp - html_features = [line.split(',')[:-1] for line in html_lines] - delete_list = [] - - # Loop through all of the features, removing ones with invalid temperature - for entry in html_features: - try: - if float(entry[1]) < self.MIN_ALLOWED_TEMPERATURE: - delete_list.append(entry) - # pylint: disable=broad-except,unused-variable - except Exception as err: - delete_list.append(entry) - # pylint: enable=broad-except,unused-variable - for entry in delete_list: - html_features.remove(entry) - return html_features + url = self.WUNDER_JSON.format(day.replace('/', ''), self.state, self.city) + data = json.loads(urlopen(url).read().decode()) + data = data['history']['observations'] + features = pd.DataFrame(data)[['date', 'tempi', 'hum', 'precipi', 'wspdi']].dropna() + features['hour'] = features['date'].apply(lambda x: x['hour']) + features = features[features['tempi'].astype(float) >= self.MIN_ALLOWED_TEMPERATURE] + return features @staticmethod - def convert_time_format(prev_date): - """ - Convert a date time object from 12 hr format (with am and pm) - to 24 hr format - - Args: - - prev_date (list): List with the 0th index in 12 hour - format and the 1st index as a string representing temperature - - Returns: - - list: A new list with the 0th index in 24 hour - format and the 1st index as a float representing temperature - """ - # Make a copy of the date object - new_date = prev_date[:] - # Convert to 24 hr format - new_hour = datetime.strptime(new_date[0], "%I:%M %p").strftime("%H") - new_date[0] = int(new_hour) - # Convert the temperature to a float from a string - new_date[1] = float(new_date[1]) - return new_date - - @staticmethod - # pylint: disable=too-many-locals def add_missing_temperature(hour_temperature_list): """ Interpolate missing temperatures for hours from the temperatures around them - Args: - - hour_temperature_list (list): A 2D list where each entry is a list containing exactly - 2 entries. 0th index is the hour, 1st index is the - temperature for that hour + hour_temperature_list (pandas.DataFrame): + A pandas Dataframe with two columns + First column is temperature, and second is hour Returns: - - list: A 2D list where each entry is a list containing exactly - 24 entries. Each entry is a 2D list with the 0th index being the - hour and the 1st index being the temperature + numpy.array: An array of 24 temperature data for each hour """ - return_list = [] - - # A 1D list that contains only the hours - hour_list = [i[0] for i in hour_temperature_list] - # No repeated entries - hour_set = set(hour_list) - - # There are sometimes multiple entries for a single hour, - # here we take the average of those entries - for hour in hour_set: - # Only average the temperatures that are for this hour - avg_temperature = np.mean( - [entry[1] for entry in hour_temperature_list if entry[0] == hour] - ) - return_list.append([int(hour), avg_temperature]) - - # For interpolation purposes we need data for the 0th hour - # and for the 23rd hour. Set them to the nearest available datapoint - if 0 not in hour_set: - return_list.insert(0, [0, return_list[0][1]]) - if 23 not in hour_set: - return_list.append([23, return_list[-1][1]]) - - # Get the first entry in the list for usage in loop - prev_entry = return_list[0] - # A list to loop through so we can add new interpolated data - # into the middle of the return_list - looping_list = return_list - return_list = [prev_entry] - for entry in looping_list[1:]: - # If there is data missing between this hour and - # the previous hour, find weighted average and set it to that value - cur_hour = entry[0] - prev_hour = prev_entry[0] - # If there is a gap between these two timestamps, interpolate the new hours - if (cur_hour - prev_hour) != 1: - cur_temp = entry[1] - prev_temp = prev_entry[1] - for new_hour in range(prev_hour + 1, cur_hour): - # Math to interpolate temperature - interpolate_sum = ( - cur_temp * (prev_hour - new_hour) + - prev_temp * (new_hour - cur_hour) - ) - new_temperature = interpolate_sum / (prev_hour - cur_hour) - - new_entry = [new_hour, new_temperature] - return_list.append(new_entry) - # Set prev entry to current entry before looping back around - return_list.append(entry) - prev_entry = entry - # pylint: enable=too-many-locals - return return_list + feature = hour_temperature_list.astype(float) + feature = feature.drop_duplicates(subset='hour', keep='first') + return np.interp(range(24), feature.iloc[:, 1], feature.iloc[:, 0]) @staticmethod - def sum_of_rain(rain_data): + def average_of_feature(feature_series, min_allowed=0, max_allowed=100): """ - Sum a days worth of rain data, ignoring the - data if it's less than 0 or not an integer - + Take the average value for a given feature + (humidity or wind currently) Args: - - rain_data (list): A list where each entry is a data point for rain - + feature_series (pandas.Series): A pandas series of the feature + min_allowed (int): An integer for minimum allowed value of feature + max_allowed (int): An integer for maximum allowed value of feature Returns: - - float: Sum of all number entries greater than 0 + float: An average """ - total = 0 - for entry in rain_data: - try: - if float(entry) >= 0: - total += float(entry) - # pylint: disable=broad-except,unused-variable - except Exception as err: - pass - # pylint: enable=broad-except,unused-variable - return total + feature = feature_series.astype(float) + feature = feature[(feature >= min_allowed) & (feature <= max_allowed)] + return feature.mean() @staticmethod - def average_of_feature(feature_list): + def sum_of_rain(rain_data): """ - Take the average value for a given feature - (humidity or wind currently) - + Sum a days worth of rain data, ignoring the + data if it's less than 0 or not an integer Args: - - feature_list (list): A list where each entry is a feature - + rain_data (pandas.Series): A pandas series with data points for rain Returns: - - float: An average + float: Sum of all number entries greater than 0 """ - total = 0 - for feature in feature_list: - try: - if float(feature) >= 0: - total += float(feature) - else: - feature_list.remove(feature) - # pylint: disable=broad-except,unused-variable - except Exception as err: - feature_list.remove(feature) - # pylint: enable=broad-except,unused-variable - return total / len(feature_list) + feature = rain_data.astype(float) + feature = feature[feature > 0] + return feature.sum() - def get_weather(self, detail=False): + def get_weather(self): """ Get the weather for this object's date range. Requires about 1 second for each day - Args: - - detail (bool): Whether or not the scrape should include humidity, wind, rain - and conditions. True for detail weather and False for temperature only - Raises: ValueError: No weather data for the day """ - # pylint: disable=too-many-locals - temperature_list = [] - humidity_list = [] - wind_list = [] - rain_list = [] - condition_list = [] - - # i is a day for day in self.day_list: try: # All of the weather data for that day day_weather = self.get_date_weather(day) - # A list with the 0th index as the hour, 1st index as temperature - hour_temperature_list = [j[:2] for j in day_weather] - if detail: - # Get the data for each feature - humidity_list.append([entry[3] for entry in day_weather]) - wind_list.append([entry[7] for entry in day_weather]) - rain_list.append([entry[9] for entry in day_weather]) - condition_list.append([entry[11] for entry in day_weather]) - # Interpolate missing temperatures - new_temperature = self.add_missing_temperature( - # Convert to 24 hr format for each timestamp in one day - [self.convert_time_format(entry) for entry in hour_temperature_list] - ) - temperature_list.append(new_temperature) + hour_temp = day_weather[['tempi', 'hour']] + self.temperature.append(self.add_missing_temperature(hour_temp)) + self.humidity.append(self.average_of_feature(day_weather['hum'])) + self.wind.append(self.average_of_feature(day_weather['wspdi'])) + self.rain.append(self.sum_of_rain(day_weather['precipi'])) + except Exception as err: raise ValueError('No temperature data for {}, {}'.format(day, str(err))) - self.temperature = temperature_list - # Get the averages for each day - self.humidity = [self.average_of_feature(entry) for entry in humidity_list] - self.wind = [self.average_of_feature(entry) for entry in wind_list] - self.rain = [self.sum_of_rain(entry) for entry in rain_list] - self.condition = condition_list # Detailed dataframe with hourly temperature data dataframe_list = [] - for i in range(len(self.day_list)): + for i, datetmp in enumerate(self.day_list): for j in range(24): dataframe_list.append([ - self.day_list[i], + datetmp, j, - round(self.temperature[i][j][1], 2), + round(self.temperature[i][j], 2), ]) self.temperature_dataframe_detail = pd.DataFrame( dataframe_list, @@ -336,12 +191,9 @@ class WeatherUnderground: # For daily only dataframe_list = [] - for i in range(len(self.day_list)): - mean = np.mean([self.temperature[i][j][1] for j in range(24)]) - dataframe_list.append([ - self.day_list[i], - round(mean, 2), - ]) + for i, datetmp in enumerate(self.day_list): + mean = np.mean(self.temperature[i]) + dataframe_list.append([datetmp, round(mean, 2)]) self.temperature_dataframe_nodetail = pd.DataFrame( dataframe_list, columns=['date', 'temperature'], @@ -351,14 +203,10 @@ class WeatherUnderground: def output_temperature(self, interval='day'): """ Args: - interval (str): Options 'day', temperature for each day or 'hour', temperature for each hour. Defaults to 'day' - Returns: - pd.DataFrame: A dataframe of the temperature data - Raises: ValueError: interval should be either 'day' or 'hour' """ @@ -372,12 +220,9 @@ class WeatherUnderground: def output_daily_data(self): """ Output the data for daily - Returns: - pd.DataFrame: A dataframe of weather data for 'temperature', 'humidity', 'wind' and 'rain'. - Raises: ValueError: Detailed weather data should be scraped first """ -- GitLab From b399fbc6aceef154007eaa76cfa8488c4b3d59b6 Mon Sep 17 00:00:00 2001 From: Boya Yu Date: Wed, 20 Sep 2017 12:48:33 -0400 Subject: [PATCH 2/2] Use the API key as an argument of function instead of part of URL --- bpeng/weather/weather.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/bpeng/weather/weather.py b/bpeng/weather/weather.py index 78008ce..d2f0f74 100644 --- a/bpeng/weather/weather.py +++ b/bpeng/weather/weather.py @@ -27,14 +27,14 @@ class WeatherUnderground: Examples: >>> wp = WeatherUnderground(period='period', startdate='2017/8/1', enddate='2017/8/2') - >>> wp.get_weather() + >>> wp.get_weather(YOUR_API_KEY) >>> print(wp.output_temperature()) date temperature 0 2017/08/01 80.59 1 2017/08/02 75.85 >>> wp = WeatherUnderground(period='period', startdate='2014/1/1', enddate='2014/1/2', city='Chattanooga', state='TN') - >>> wp.get_weather() + >>> wp.get_weather(YOUR_API_KEY) >>> print(wp.output_daily_data()) temperature humidity wind rain 2014/01/01 36.68 79.0 1.790000 0.00 @@ -44,7 +44,7 @@ class WeatherUnderground: # pylint: disable=too-many-instance-attributes,too-many-arguments # WUNDER_URL = 'https://www.wunderground.com/history/airport/{}/{}/DailyHistory.html?format=1' # url above is deprecated - WUNDER_JSON = 'http://api.wunderground.com/api/cfc5106fcb7366d8/history_{}/q/{}/{}.json' + WUNDER_JSON = 'http://api.wunderground.com/api/{}/history_{}/q/{}/{}.json' MIN_ALLOWED_TEMPERATURE = -100 # The minimum allowed value for temperature temperature = None humidity = None @@ -72,6 +72,8 @@ class WeatherUnderground: self.wind = [] self.rain = [] + self.api_key = None + if period == 'year': start_day = date(year, 1, 1) end_day = date(year, 12, 31) @@ -100,10 +102,16 @@ class WeatherUnderground: day (string): The date to get weather for Returns: pd.DataFrame: a dataframe with weather data + Raises: + KeyError: API Key does not exist or exceeds limit. """ - url = self.WUNDER_JSON.format(day.replace('/', ''), self.state, self.city) + url = self.WUNDER_JSON.format(self.api_key, day.replace('/', ''), + self.state, self.city) data = json.loads(urlopen(url).read().decode()) - data = data['history']['observations'] + try: + data = data['history']['observations'] + except Exception as _: + raise KeyError('Key does not exist or exceeds limit.') features = pd.DataFrame(data)[['date', 'tempi', 'hum', 'precipi', 'wspdi']].dropna() features['hour'] = features['date'].apply(lambda x: x['hour']) features = features[features['tempi'].astype(float) >= self.MIN_ALLOWED_TEMPERATURE] @@ -155,13 +163,15 @@ class WeatherUnderground: feature = feature[feature > 0] return feature.sum() - def get_weather(self): + def get_weather(self, api_key): """ Get the weather for this object's date range. Requires about 1 second for each day - + Args: + api_key (str): API key at weather underground Raises: ValueError: No weather data for the day """ + self.api_key = api_key for day in self.day_list: try: # All of the weather data for that day -- GitLab