Created
June 10, 2024 21:19
-
-
Save stevewasiura/41fe996fafeefafab5097681a900aec8 to your computer and use it in GitHub Desktop.
scrape html webpage of observed weather and parse precipitation values for hours 1, 3 and 6
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # 2024-06-10 | |
| # simple python code to scrape html webpage from weather.gov (requires only 1 library named "requests") | |
| # and parse values of columns "precipitation" aggregated by 1 hour, 3 hours, 6 hours | |
| # and loop thru each row of data from past 24 hours, | |
| # and sum the values to get a total rainfall amount in inches | |
| # to be used to automate the watering system of a garden | |
| import requests # http requests, needed to get http data | |
# Fetch the observation-history page for Dunkirk NY (KDKK) from weather.gov,
# total the three precipitation columns (1 hr, 3 hr, 6 hr; inches) over the
# most recent rows of the table, and print the totals.

# url is for nat weather observations from Dunkirk NY
NATWEATHER_KDKK_URL = "https://forecast.weather.gov/data/obhistory/KDKK.html"
# used to be... "https://w1.weather.gov/data/obhistory/KDKK.html"

# Give up on the request instead of hanging forever; this script is meant to
# run unattended as part of an automated watering system.
HTTP_TIMEOUT_SECONDS = 30

# Zero-based positions of the 1 hr, 3 hr and 6 hr precipitation cells within
# a data row.
# NOTE(review): assumes the page keeps its 18-column layout with these three
# columns last - confirm against the live page if the totals look wrong.
PRECIP_COLUMNS = (15, 16, 17)

# Number of table rows to total.
# NOTE(review): presumably one row per hourly observation, i.e. the past
# 24 hours - confirm for this station.
MAX_ROWS = 24

TABLE_OPEN = "<tbody>"
TABLE_CLOSE = "</tbody>"


def _cell_value(cell_html):
    """Return the number held in one ``<td ...>value`` fragment.

    Everything up to the last ``>`` is discarded, so an opening tag with
    attributes is handled as well as a bare ``<td>``.  An empty cell counts
    as 0.0.  Raises ValueError if the remaining text is not a number.
    """
    text = cell_html.rpartition(">")[2].strip()
    # Convert the text itself: appending "0" to make empty cells parse would
    # turn an integer-formatted cell such as "1" into 10.0.
    return float(text) if text else 0.0


def sum_precipitation(html, columns=PRECIP_COLUMNS, max_rows=MAX_ROWS):
    """Total the selected cells over the first rows of the data table.

    Args:
        html: full text of the observation-history page.
        columns: zero-based cell positions to total in each row.
        max_rows: how many leading ``</tr>``-delimited rows to look at.

    Returns:
        A tuple with one total per entry in ``columns``, in the same order.

    Raises:
        ValueError: if the page has no ``<tbody>...</tbody>`` section or a
            selected cell holds text that is not a number.
    """
    start = html.find(TABLE_OPEN)
    end = html.find(TABLE_CLOSE, start) if start != -1 else -1
    if end == -1:
        # Fail loudly: returning zeros here would look like "it did not
        # rain" when in fact the page could not be parsed.
        raise ValueError("observation table (<tbody>) not found")

    table_body = html[start + len(TABLE_OPEN):end]
    totals = [0] * len(columns)
    cells_needed = max(columns, default=-1) + 1

    for row in table_body.split("</tr>")[:max_rows]:
        # The piece after the final </td> is leftover markup, not a cell.
        cells = row.split("</td>")[:-1]
        if len(cells) < cells_needed:
            # Not a full data row (e.g. the remainder after the last </tr>).
            continue
        for slot, column in enumerate(columns):
            totals[slot] += _cell_value(cells[column])

    return tuple(totals)


def main():
    """Download the page, total the precipitation columns, print the sums."""
    try:
        response = requests.get(NATWEATHER_KDKK_URL, timeout=HTTP_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException:
        # connection failure or timeout
        print("http error")
        return

    # need to check response code to make sure it was 200 (OK)
    if response.status_code != 200:
        # the http request had an error
        print("http error")
        return

    try:
        precip_1_hr_sum, precip_3_hr_sum, precip_6_hr_sum = sum_precipitation(
            response.text
        )
    except ValueError as err:
        print("parse error: " + str(err))
        return

    # print the sums
    print( "precip_1_hr_sum= "+ str(precip_1_hr_sum) )
    print( "precip_3_hr_sum= "+ str(precip_3_hr_sum) )
    print( "precip_6_hr_sum= "+ str(precip_6_hr_sum) )


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment