# stevewasiura/weather_observed_scrape_html_parse_precipitation_hrs_1_3_6.py

## weather_observed_scrape_html_parse_precipitation_hrs_1_3_6.py
# 2024-06-10
# Simple Python script that scrapes an HTML page from weather.gov
# (requires only one third-party library, named "requests"),
# parses the "precipitation" columns aggregated by 1 hour, 3 hours and 6 hours,
# loops through the most recent rows of data (intended to cover the past 24 hours),
# and sums the values to get total rainfall amounts in inches,
# to be used to automate the watering system of a garden.


import requests # http requests, needed to get http data

# Use the requests library to download the observation page and add up rainfall.
# The URL is the National Weather Service observation history for Dunkirk, NY.
NATWEATHER_KDKK_URL = "https://forecast.weather.gov/data/obhistory/KDKK.html"
# used to be... "https://w1.weather.gov/data/obhistory/KDKK.html"

# Seconds to wait for the server; without a timeout requests.get() may block forever.
REQUEST_TIMEOUT_SECONDS = 30

# Number of complete <td> cells a data row must have (cell indexes 0 to 17).
CELLS_PER_ROW = 18
# Zero-based cell indexes of the 1 hour, 3 hour and 6 hour precipitation columns.
PRECIP_COLUMNS = (15, 16, 17)
# Upper bound on the number of '</tr>'-separated fragments that are examined.
MAX_ROWS = 24


def _cell_to_float(cell):
    """Return the number held in one '<td ...>value' fragment.

    ``cell`` is one piece of a table row after splitting on '</td>', so it
    still starts with its opening tag.  An empty cell, a fragment without a
    '<td' tag, or text that is not a number all count as 0.0.
    """
    tag_start = cell.find("<td")
    tag_end = cell.find(">", tag_start) if tag_start != -1 else -1
    if tag_end == -1:
        return 0.0
    try:
        # Parse the text itself; appending "0" (as was done before) would
        # turn a whole number such as "2" into "20".
        return float(cell[tag_end + 1:].strip())
    except ValueError:
        return 0.0


def sum_precipitation(html, max_rows=MAX_ROWS):
    """Add up the three precipitation columns of the first <tbody> in ``html``.

    Args:
        html: Text of the observation history page.
        max_rows: How many '</tr>'-separated fragments to examine, counted
            from the top of the table.

    Returns:
        A tuple ``(sum_1_hr, sum_3_hr, sum_6_hr)``.  All three are 0 when the
        page contains no '<tbody>' or no complete data row.
    """
    body_start = html.find("<tbody>")
    if body_start == -1:
        return (0, 0, 0)
    body_end = html.find("</tbody>", body_start)
    table_body = html[body_start:] if body_end == -1 else html[body_start:body_end]

    totals = [0, 0, 0]
    for row in table_body.split("</tr>")[:max_rows]:
        cells = row.split("</td>")
        # The last fragment is whatever follows the final '</td>', so a row
        # with 18 cells yields 19 fragments.  Shorter rows are skipped.
        if len(cells) - 1 < CELLS_PER_ROW:
            continue
        for slot, column in enumerate(PRECIP_COLUMNS):
            totals[slot] = totals[slot] + _cell_to_float(cells[column])
    return tuple(totals)


def main():
    """Download the observation page and print the three rainfall sums.

    Prints "http error" when the server does not answer with status 200.
    Exceptions raised by requests (connection failure, timeout) are not
    caught here.
    """
    response = requests.get(NATWEATHER_KDKK_URL, timeout=REQUEST_TIMEOUT_SECONDS)

    # need to check response code to make sure it was 200 (OK)
    if response.status_code != 200:
        # the http request had an error
        print("http error")
        return

    precip_1_hr_sum, precip_3_hr_sum, precip_6_hr_sum = sum_precipitation(response.text)

    # print the sums
    print("precip_1_hr_sum= " + str(precip_1_hr_sum))
    print("precip_3_hr_sum= " + str(precip_3_hr_sum))
    print("precip_6_hr_sum= " + str(precip_6_hr_sum))


if __name__ == "__main__":
    main()
	# 2024-06-10
	# Simple Python script that scrapes an HTML page from weather.gov
	# (requires only one third-party library, named "requests"),
	# parses the "precipitation" columns aggregated by 1 hour, 3 hours and 6 hours,
	# loops through the most recent rows of data,
	# and sums the values to get total rainfall amounts in inches,
	# to be used to automate the watering system of a garden.

# The URL is the National Weather Service observation history for Dunkirk, NY.
NATWEATHER_KDKK_URL = "https://forecast.weather.gov/data/obhistory/KDKK.html"
# used to be... "https://w1.weather.gov/data/obhistory/KDKK.html"

# Seconds to wait for the server; without a timeout requests.get() may block forever.
REQUEST_TIMEOUT_SECONDS = 30

# Number of complete <td> cells a data row must have (cell indexes 0 to 17).
CELLS_PER_ROW = 18
# Zero-based cell indexes of the 1 hour, 3 hour and 6 hour precipitation columns.
PRECIP_COLUMNS = (15, 16, 17)
# Upper bound on the number of '</tr>'-separated fragments that are examined.
MAX_ROWS = 24


def _cell_to_float(cell):
    """Return the number held in one '<td ...>value' fragment.

    ``cell`` is one piece of a table row after splitting on '</td>', so it
    still starts with its opening tag.  An empty cell, a fragment without a
    '<td' tag, or text that is not a number all count as 0.0.
    """
    tag_start = cell.find("<td")
    tag_end = cell.find(">", tag_start) if tag_start != -1 else -1
    if tag_end == -1:
        return 0.0
    try:
        # Parse the text itself; appending "0" to the text would turn a
        # whole number such as "2" into "20".
        return float(cell[tag_end + 1:].strip())
    except ValueError:
        return 0.0


def sum_precipitation(html, max_rows=MAX_ROWS):
    """Add up the three precipitation columns of the first <tbody> in ``html``.

    Args:
        html: Text of the observation history page.
        max_rows: How many '</tr>'-separated fragments to examine, counted
            from the top of the table.

    Returns:
        A tuple ``(sum_1_hr, sum_3_hr, sum_6_hr)``.  All three are 0 when the
        page contains no '<tbody>' or no complete data row.
    """
    body_start = html.find("<tbody>")
    if body_start == -1:
        return (0, 0, 0)
    body_end = html.find("</tbody>", body_start)
    table_body = html[body_start:] if body_end == -1 else html[body_start:body_end]

    totals = [0, 0, 0]
    for row in table_body.split("</tr>")[:max_rows]:
        cells = row.split("</td>")
        # The last fragment is whatever follows the final '</td>', so a row
        # with 18 cells yields 19 fragments.  Shorter rows are skipped.
        if len(cells) - 1 < CELLS_PER_ROW:
            continue
        for slot, column in enumerate(PRECIP_COLUMNS):
            totals[slot] = totals[slot] + _cell_to_float(cells[column])
    return tuple(totals)


def main():
    """Download the observation page and print the three rainfall sums.

    Prints "http error" when the server does not answer with status 200.
    Exceptions raised by requests (connection failure, timeout) are not
    caught here.
    """
    # Imported here so the parsing helpers above can be loaded and used
    # without the HTTP dependency being installed.
    import requests  # http requests, needed to get http data

    response = requests.get(NATWEATHER_KDKK_URL, timeout=REQUEST_TIMEOUT_SECONDS)

    # need to check response code to make sure it was 200 (OK)
    if response.status_code != 200:
        # the http request had an error
        print("http error")
        return

    precip_1_hr_sum, precip_3_hr_sum, precip_6_hr_sum = sum_precipitation(response.text)

    # print the sums
    print("precip_1_hr_sum= " + str(precip_1_hr_sum))
    print("precip_3_hr_sum= " + str(precip_3_hr_sum))
    print("precip_6_hr_sum= " + str(precip_6_hr_sum))


if __name__ == "__main__":
    main()
# No results found