""" Weather scraper for wunderground.com """
# Provenance: added by the commit "Add weather underground scraper" as
# bpeng/weather/weather.py.  The package's __init__.py re-exports the class
# with ``from .weather import WeatherUnderground``.
from datetime import date, datetime, timedelta
from urllib.request import urlopen

import numpy as np
import pandas as pd


class WeatherUnderground:
    """
    Class for parsing weather data from 'wunderground.com'
    This version only for years after 2000

    Args:

        period (str): year | period, optional, default 'year'
            If 'year', get the weather data for one single year
            If 'period', get the weather data within a particular period
        year (int): Year which data is required. Ignored when period = 'period'.
        startdate (str): Startdate of which data is required in format year/month/day.
            Ignored when period = 'year'
        enddate (str): Enddate of which data is required. Ignored when period = 'year'.
        location (str): Location or airport where data is required
            i.e. KCHA for Chattanooga, TN
                 KNYC for New York location, NY

    Attributes:

        day_list (list): A list of all the dates (str, 'YYYY/MM/DD') within the period
        temperature (list): Per day, a list of 24 ``[hour, temperature]`` pairs.
            Set by ``get_weather``.
        humidity (list): Per-day average humidity. Filled only when
            ``get_weather(detail=True)`` was called.
        wind (list): Per-day average wind. Filled only when
            ``get_weather(detail=True)`` was called.
        rain (list): Per-day rain total. Filled only when
            ``get_weather(detail=True)`` was called.
        condition (list): Per-day list of the raw condition strings. Filled only
            when ``get_weather(detail=True)`` was called.

    Examples:

        >>> wp = WeatherUnderground(period='period',startdate='2016/3/1',
                                    enddate='2016/3/3', location='KNYC')
        >>> wp.get_weather()
        >>> print(wp.output_temperature())
                 date  temperature
        0  2016/03/01        45.63
        1  2016/03/02        41.95
        2  2016/03/03        31.00

        >>> wp = WeatherUnderground(period='period',startdate='2014/1/1',
                                    enddate='2014/1/2', location='KCHA')
        >>> wp.get_weather(detail=True)
        >>> print(wp.output_daily_data())
                    temperature  humidity       wind  rain
        2014/01/01        36.68      79.0   1.790000  0.00
        2014/01/02        38.63      79.0  10.028205  0.43
    """

    # pylint: disable=too-many-instance-attributes
    # URL template; filled with (location, 'YYYY/MM/DD')
    WUNDER_URL = 'https://www.wunderground.com/history/airport/{}/{}/DailyHistory.html?format=1'
    MIN_ALLOWED_TEMPERATURE = -100  # The minimum allowed value for temperature
    # Results of the last ``get_weather`` call; None until it has been run
    temperature = None
    humidity = None
    wind = None
    rain = None
    condition = None
    temperature_dataframe_detail = None
    temperature_dataframe_nodetail = None

    def __init__(
            self,
            period='year',
            year=2014,
            startdate='sd',
            enddate='ed',
            location='KNYC'
    ):  # pylint: disable=too-many-arguments
        """
        Build the list of days to scrape.

        Raises:
            ValueError: ``period`` is neither 'year' nor 'period', or the
                start/end dates cannot be parsed
        """
        self.year = year
        self.location = location

        if period == 'year':
            start_day = date(year, 1, 1)
            end_day = date(year, 12, 31)
        elif period == 'period':
            try:
                start_day = pd.to_datetime(startdate).date()
                end_day = pd.to_datetime(enddate).date()
            # Any parsing failure is reported uniformly as a ValueError, but the
            # original cause is kept in the exception chain for debugging.
            except Exception as err:  # pylint: disable=broad-except
                raise ValueError('Check date time') from err
        else:
            raise ValueError("period should be 'year' or 'period'")

        # One 'YYYY/MM/DD' string for each day in the inclusive range.
        # %Y is used (rather than a hard-coded "20" prefix) so the century
        # always matches the requested date.
        total_days = (end_day - start_day).days + 1
        self.day_list = [
            (start_day + timedelta(days=offset)).strftime('%Y/%m/%d')
            for offset in range(total_days)
        ]

    def _has_valid_temperature(self, entry):
        """ Tell whether a row's temperature (index 1) is a usable number """
        try:
            # NaN compares False here and is therefore rejected as well
            return float(entry[1]) >= self.MIN_ALLOWED_TEMPERATURE
        except (IndexError, ValueError):
            # Row too short, or the temperature is not numeric
            return False

    def get_date_weather(self, day):
        """
        Get the weather for a given date

        Args:

            day (string): The date to get weather for

        Returns:

            list: A 2D list where each entry is a list of features for a timestamp.
                Rows whose temperature is missing, not numeric or below
                ``MIN_ALLOWED_TEMPERATURE`` are left out.
        """
        url = self.WUNDER_URL.format(self.location, day)
        # The context manager closes the response even if reading fails
        with urlopen(url) as response:
            html = response.read().decode('utf-8')
        # Separate html by new lines, each entry is a timestamp with data.
        # The first two lines and the last one carry no data rows.
        html_lines = html.split('\n')[2:-1]
        # 2D list where each entry is a list of features for a given timestamp
        # (the last comma separated field of every row is dropped)
        html_features = [line.split(',')[:-1] for line in html_lines]
        return [entry for entry in html_features if self._has_valid_temperature(entry)]

    @staticmethod
    def convert_time_format(prev_date):
        """
        Convert a date time object from 12 hr format (with am and pm)
        to 24 hr format

        Args:

            prev_date (list): List with the 0th index in 12 hour
                format and the 1st index as a string representing temperature

        Returns:

            list: A new list with the 0th index as the hour (int) in 24 hour
                format and the 1st index as a float representing temperature
        """
        # Work on a copy so the caller's list is left untouched
        new_date = prev_date[:]
        new_date[0] = datetime.strptime(new_date[0], "%I:%M %p").hour
        new_date[1] = float(new_date[1])
        return new_date

    @staticmethod
    def add_missing_temperature(hour_temperature_list):
        """
        Interpolate missing temperatures for hours from the temperatures around them

        Args:

            hour_temperature_list (list): A 2D list where each entry is a list whose
                0th index is the hour and 1st index is the temperature for that hour

        Returns:

            list: ``[hour, temperature]`` pairs in ascending hour order, one for
                every hour from 0 to 23 (given hours within that range)

        Raises:
            ValueError: ``hour_temperature_list`` is empty
        """
        if not hour_temperature_list:
            raise ValueError('No temperature readings to interpolate')

        # There are sometimes multiple entries for a single hour,
        # collect them so they can be averaged
        readings_by_hour = {}
        for entry in hour_temperature_list:
            readings_by_hour.setdefault(entry[0], []).append(entry[1])

        # Sort explicitly: the interpolation below needs ascending hours and
        # neither set nor dict iteration order guarantees that
        known = [
            [int(hour), float(np.mean(readings_by_hour[hour]))]
            for hour in sorted(readings_by_hour)
        ]

        # For interpolation purposes we need data for the 0th hour
        # and for the 23rd hour. Set them to the nearest available datapoint
        if 0 not in readings_by_hour:
            known.insert(0, [0, known[0][1]])
        if 23 not in readings_by_hour:
            known.append([23, known[-1][1]])

        filled = [known[0]]
        for prev_entry, entry in zip(known, known[1:]):
            prev_hour, prev_temp = prev_entry
            cur_hour, cur_temp = entry
            # Linear interpolation for every hour strictly between two readings
            # (the range is empty when the readings are one hour apart)
            for new_hour in range(prev_hour + 1, cur_hour):
                weighted_sum = (
                    cur_temp * (new_hour - prev_hour)
                    + prev_temp * (cur_hour - new_hour)
                )
                filled.append([new_hour, weighted_sum / (cur_hour - prev_hour)])
            filled.append(entry)
        return filled

    @staticmethod
    def _non_negative_values(raw_values):
        """ Parse the entries as floats, keeping only those that are >= 0 """
        values = []
        for raw in raw_values:
            try:
                value = float(raw)
            except (TypeError, ValueError):
                # Not a number (e.g. 'N/A', '' or None): ignore the entry
                continue
            if value >= 0:
                values.append(value)
        return values

    @staticmethod
    def sum_of_rain(rain_data):
        """
        Sum a days worth of rain data, ignoring the
        data if it's less than 0 or not a number

        Args:

            rain_data (list): A list where each entry is a data point for rain

        Returns:

            float: Sum of all number entries greater than or equal to 0
                (0 when there are none)
        """
        return sum(WeatherUnderground._non_negative_values(rain_data))

    @staticmethod
    def average_of_feature(feature_list):
        """
        Take the average value for a given feature
        (humidity or wind currently)

        Entries that are negative or not a number are ignored. The list passed
        in is not modified.

        Args:

            feature_list (list): A list where each entry is a feature

        Returns:

            float: The average of the valid entries, NaN when there are none
        """
        values = WeatherUnderground._non_negative_values(feature_list)
        if not values:
            return float('nan')
        return sum(values) / len(values)

    def get_weather(self, detail=False):
        """
        Get the weather for this object's date range. Requires about 1 second for each day

        Args:

            detail (bool): Whether or not the scrape should include humidity, wind, rain
                and conditions. True for detail weather and False for temperature only

        Raises:
            ValueError: No weather data for the day
        """
        # pylint: disable=too-many-locals
        temperature_list = []
        humidity_list = []
        wind_list = []
        rain_list = []
        condition_list = []

        for day in self.day_list:
            try:
                # All of the weather data for that day
                day_weather = self.get_date_weather(day)
                if detail:
                    # Column positions of each feature in a scraped row
                    humidity_list.append([entry[3] for entry in day_weather])
                    wind_list.append([entry[7] for entry in day_weather])
                    rain_list.append([entry[9] for entry in day_weather])
                    condition_list.append([entry[11] for entry in day_weather])
                # First two columns are the 12 hr time and the temperature:
                # convert to [hour, temperature] and interpolate missing hours
                temperature_list.append(self.add_missing_temperature(
                    [self.convert_time_format(entry[:2]) for entry in day_weather]
                ))
            # Whatever goes wrong for a day (network, parsing, no rows) is
            # reported as a ValueError naming that day; the cause stays chained.
            except Exception as err:  # pylint: disable=broad-except
                raise ValueError(
                    'No temperature data for {}, {}'.format(day, str(err))
                ) from err

        self.temperature = temperature_list
        # Get the averages for each day (empty lists when detail is False)
        self.humidity = [self.average_of_feature(entry) for entry in humidity_list]
        self.wind = [self.average_of_feature(entry) for entry in wind_list]
        self.rain = [self.sum_of_rain(entry) for entry in rain_list]
        self.condition = condition_list

        hourly_rows = []  # one row per day and hour
        daily_rows = []   # one row per day with the mean of its hourly values
        for day, day_temperatures in zip(self.day_list, temperature_list):
            for hour, temperature in day_temperatures:
                hourly_rows.append([day, hour, round(temperature, 2)])
            mean = np.mean([temperature for _, temperature in day_temperatures])
            daily_rows.append([day, round(mean, 2)])

        self.temperature_dataframe_detail = pd.DataFrame(
            hourly_rows,
            columns=['date', 'hour', 'temperature'],
        )
        self.temperature_dataframe_nodetail = pd.DataFrame(
            daily_rows,
            columns=['date', 'temperature'],
        )

    def output_temperature(self, interval='day'):
        """
        Return the temperature data gathered by ``get_weather``

        Args:

            interval (str): Options 'day', temperature for each day or
                'hour', temperature for each hour. Defaults to 'day'

        Returns:

            pd.DataFrame: A dataframe of the temperature data
                (None when ``get_weather`` has not been called yet)

        Raises:
            ValueError: interval should be either 'day' or 'hour'
        """
        if interval == 'hour':
            return self.temperature_dataframe_detail
        if interval == 'day':
            return self.temperature_dataframe_nodetail
        raise ValueError("Choose interval parameter within 'day' and 'hour'")

    def output_daily_data(self):
        """
        Output the data for daily

        Returns:

            pd.DataFrame: A dataframe of weather data for 'temperature', 'humidity',
                'wind' and 'rain', indexed by day.

        Raises:
            ValueError: Detailed weather data should be scraped first
        """
        daily_temperature = self.temperature_dataframe_nodetail
        expected_days = len(self.day_list)
        # Checked explicitly: after a scrape without detail the feature lists
        # are empty, and pandas would silently pad them with NaN
        detail_missing = any(
            series is None or len(series) != expected_days
            for series in (self.humidity, self.wind, self.rain)
        )
        if daily_temperature is None or detail_missing:
            raise ValueError("Get detail weather first")

        return pd.DataFrame(
            {
                'temperature': list(daily_temperature['temperature']),
                'humidity': self.humidity,
                'wind': self.wind,
                'rain': self.rain,
            },
            index=self.day_list,
            columns=['temperature', 'humidity', 'wind', 'rain'],
        )