# eHorn96/preprocessing.py

## preprocessing.py
def graph_download(coords:list,
                   root_dir:str,
                   name:str,simplified=False,
                   distance=9000,
                   keep_geometries=False,
                   nominatim_endpoint='http://localhost:8088/',
                   network_type='drive'):
    """Build a node-labelled road-network dataset around ``coords`` and save it as ``.npz``.

    Unless ``<root_dir>/raw/edges.csv`` exists, the network is downloaded with
    ``ox.graph_from_point``, edge attributes are cleaned into numeric or
    categorical values, and nodes without a ``highway`` tag are looked up
    through the Nominatim reverse endpoint. Afterwards nodes are labelled 1
    when they are one of the 5 nearest nodes of an accident point and closer
    than 6e-3 coordinate units, highway classes are one-hot encoded, missing
    values are imputed, and the arrays are written with ``np.savez`` to
    ``<root_dir>/<name>/raw/<name>.npz``.

    The function relies on module-level names that are not defined here:
    ``unfaelle`` (a table with ``YGCSWGS84``/``XGCSWGS84`` columns), ``filter``,
    ``ang_gen`` and ``dir_gen``.

    Args:
        coords: Centre point handed to ``ox.graph_from_point``; ``coords[0]``
            and ``coords[1]`` are also forwarded to ``filter``
            (presumably latitude, longitude -- confirm against callers).
        root_dir: Working directory. It is created when missing and receives
            the intermediate ``edges.csv`` / ``nodes.csv`` files.
        name: Sub-directory and file stem of the ``.npz`` output.
        simplified: Stored in the graph attributes that are used when the
            graph is rebuilt from cached CSVs. The download itself always
            uses ``simplify=False``.
        distance: Passed as ``dist`` to ``ox.graph_from_point``; ``2*distance``
            is forwarded to ``filter``.
        keep_geometries: Forwarded as ``node_geometry`` and
            ``fill_edge_geometry`` to ``ox.graph_to_gdfs``.
        nominatim_endpoint: Base URL to which ``reverse.php?...`` is appended.
        network_type: Forwarded to ``ox.graph_from_point``.

    Raises:
        TypeError: If an edge ``highway`` value is not a str, list or float.
        ValueError: If the edge index contains NaN, or if cached node/edge
            indices are not unique.
    """

    def _lane_count(value):
        # Lists of lane counts are summed; unparsable text becomes NaN;
        # anything else (e.g. NaN) passes through unchanged.
        try:
            if isinstance(value, list):
                return sum(float(part) for part in value)
            if isinstance(value, str):
                return float(value)
        except ValueError:
            return np.nan
        return value

    def _highway_pair(tag):
        # One (u, v) highway class per edge: a single tag is used for both
        # ends, a list contributes its first two entries, NaN (a float)
        # falls back to "unclassified".
        if isinstance(tag, str):
            return [tag, tag]
        if isinstance(tag, list):
            return [tag[0], tag[1]]
        if isinstance(tag, float):
            return ["unclassified", "unclassified"]
        raise TypeError(
            f"Could not unnest value with type: {type(tag)}"
            )

    graph_attrs = {'crs': 'epsg:4326',
                   'simplified': simplified
                   }
    cols_dtypes = {"u":int,"v":int,
                   "oneway":bool,
                   "maxspeed":float,
                   "reversed":bool,
                   "length":float,
                   "lanes":float,
                   "access":bool,
                   "bridge":bool,
                   "width":float,
                   "tunnel":bool,
                   "u_highway":str,
                   "v_highway":str
                   }
    yes_no = {"yes": 1, "no": 0}

    fp = osp.join(root_dir)
    raw_fp = osp.join(fp,'raw')
    # Joined paths, so a root_dir without a trailing separator no longer
    # produces sibling files such as "<root_dir>edges.csv".
    edges_csv = osp.join(fp,'edges.csv')
    nodes_csv = osp.join(fp,'nodes.csv')

    # NOTE(review): the cache probe looks in <root_dir>/raw/, while the CSVs
    # below are written to <root_dir>/ -- the cached branch is therefore only
    # taken when edges.csv was placed in raw/ by other means. Confirm the
    # intended layout before unifying the two locations.
    if not os.path.exists(osp.join(raw_fp,'edges.csv')):
        os.makedirs(fp, exist_ok=True)
        print("Files don't exist in path. Downloading through Overpass.")
        G = ox.graph_from_point(coords,
                                dist=distance,
                                dist_type="network",
                                network_type=network_type,
                                simplify=False,
                                truncate_by_edge=True)
        G = ox.speed.add_edge_speeds(G)
        nodes,edges = ox.graph_to_gdfs(G,
                                        nodes=True,
                                        node_geometry=keep_geometries,
                                        fill_edge_geometry=keep_geometries)

        # Columns are only present when at least one edge carries the tag,
        # so every drop (including 'maxspeed') tolerates a missing column.
        edges = edges.drop(columns=['junction','geometry','ref','name',
                                    'osmid','area','est_width','maxspeed'],
                           errors='ignore')

        # Results are assigned back explicitly: in-place calls on
        # `edges.<column>` do not reach the frame under pandas Copy-on-Write.
        if "tunnel" in edges.columns:
            edges["tunnel"] = edges["tunnel"].replace(
                {"building_passage": 1, **yes_no}).fillna(0)

        if "bridge" in edges.columns:
            edges["bridge"] = edges["bridge"].replace(yes_no).fillna(0)

        if "access" in edges.columns:
            counts = edges["access"].value_counts()
            # Values seen on fewer than 2.5% of the edges are treated as 'yes'.
            rare = counts.index[counts < len(edges)*0.025]
            edges["access"] = np.where(edges["access"].isin(rare), 'yes',
                                       edges["access"])
            edges["access"] = edges["access"].replace(yes_no).fillna(1)

        if "width" in edges.columns:
            # Keeps the first run of digits only (e.g. "3.5" -> 3.0).
            edges["width"] = edges["width"].str.extract(
                r'(\d+)', expand=False).astype(float)

        if "highway" in nodes.columns:
            # One session for all lookups so the connection is reused.
            with requests.Session() as session:
                def fillhighway(row):
                    # Untagged nodes hold NaN (a float): ask Nominatim for
                    # the feature type at the node position instead.
                    if isinstance(row.highway,float):
                        res_obj = session.get(
                            nominatim_endpoint + f'reverse.php?lat={row.y}&lon={row.x}&format=jsonv2&extratags=1&zoom=17',
                            timeout=60).json()
                        return res_obj["type"]
                    return row["highway"]
                nodes["highway"] = nodes.apply(fillhighway,axis=1)
            counts = nodes["highway"].value_counts()
            # Classes seen on fewer than 2.5% of the nodes become 'residential'.
            rare = counts.index[counts < (len(nodes)*0.025)]
            nodes["highway"] = np.where(nodes["highway"].isin(rare),
                                        'residential',
                                        nodes["highway"])

        if "lanes" in edges.columns:
            edges["lanes"] = pd.Series(
                [_lane_count(value) for value in edges["lanes"]],
                index=edges.index).astype(np.float32)

        highway_pairs = np.array(
            [_highway_pair(tag) for tag in edges["highway"].to_list()])
        edges['u_highway'] = highway_pairs[0:,0]
        edges['v_highway'] = highway_pairs[0:,1]
        edges = edges.drop(columns='highway')

        # `== np.nan` is never true, so the index is checked with isna().
        if edges.index.to_frame(index=False).isna().to_numpy().any():
            raise ValueError("Edge index contains NaN entries.")
        nodes = nodes.drop(columns="ref", errors='ignore')

        edges["u_highway"] = edges["u_highway"].astype("category")
        edges["v_highway"] = edges["v_highway"].astype("category")
    else:
        print("Found csv files.")
        nodes = gpd.read_file(nodes_csv,
                              layer = 'nodes').convert_dtypes().set_index('osmid')
        edges = gpd.read_file(edges_csv,
                              layer='edges').convert_dtypes().set_index(
                                  ['u','v','key'])
        if not (nodes.index.is_unique and edges.index.is_unique):
            raise ValueError("Cached node/edge indices are not unique.")
        G = ox.graph_from_gdfs(nodes,edges, graph_attrs)

    # --- node labels ------------------------------------------------------
    y = np.zeros(len(nodes),dtype=np.int8)
    unf = unfaelle[unfaelle.apply(filter, args = (coords[0],coords[1],2*distance), axis=1)]
    unf = unf[["YGCSWGS84","XGCSWGS84"]].to_numpy()
    pts = nodes[['y','x']].to_numpy()

    tree = spatial.KDTree(pts)
    if len(unf):
        # All accident points are queried at once. Neighbours that do not
        # exist come back with an infinite distance and are masked out by
        # the threshold (6e-3, in coordinate units; tuned for Germany).
        nn_dist, nn_idx = tree.query(unf,k=5)
        y[nn_idx[nn_dist <= 6e-3]] = 1

    # --- geometric edge attributes ---------------------------------------
    edge_attr_ang = np.array(
        [ang_gen(idx[0], idx[1], G=G) for idx in edges.index],
        dtype=np.float32)
    # Every row in which ang_gen produced a NaN is replaced by (-1, -1, -1).
    nan_rows = np.where(np.isnan(edge_attr_ang))[0]
    edge_attr_ang[nan_rows] = np.array([-1,-1,-1],dtype=np.float32)

    edge_attr_dir = np.array(
        [dir_gen(idx[0], idx[1], G=G) for idx in edges.index],
        dtype=np.float32)

    # --- CSV round trip: flattens the indices into plain columns ----------
    edges = edges.drop(columns="geometry", errors='ignore')
    nodes = nodes.drop(columns="geometry", errors='ignore')
    edges.to_csv(edges_csv)
    nodes.to_csv(nodes_csv)
    edges = pd.read_csv(edges_csv,dtype=cols_dtypes)
    nodes = pd.read_csv(nodes_csv)

    # --- one-hot encoding of the highway classes -------------------------
    # The encoder is fitted on u_highway only and reused for v_highway and
    # for the node highway column.
    benc_highway = OneHotEncoder(return_df=True,drop_invariant=False)
    benc_highway.fit(edges['u_highway'].to_numpy())

    edges_encoded_u = benc_highway.transform(
        edges['u_highway'].to_numpy())
    edges_encoded_v = benc_highway.transform(
        edges['v_highway'].to_numpy())
    nodes_encoded = benc_highway.transform(
        nodes['highway'].to_numpy())

    edges = pd.concat([edges,
                       edges_encoded_u,
                       edges_encoded_v],
                      axis = 1)
    nodes = pd.concat([nodes,
                       nodes_encoded],
                      axis=1)

    # --- edge index: row positions of the end nodes of every edge ---------
    # A single osmid -> row lookup replaces one full column scan per edge.
    node_position = {osmid: position for position, osmid
                     in enumerate(nodes['osmid'].to_numpy())}
    edge_index = np.array(
        [[node_position[u], node_position[v]]
         for u, v in edges[['u','v']].to_numpy()],
        dtype=np.int32)

    coordinates = torch.from_numpy(
        nodes.iloc[:,1:3].to_numpy()).to(torch.float)

    # --- numeric feature matrices with imputed gaps -----------------------
    edges_imputer = IterativeImputer()
    nodes_imputer = IterativeImputer()

    edges = edges.drop(['u',
                        'v',
                        'key',
                        'u_highway',
                        'v_highway'],
                       axis=1)
    edges = edges.apply(pd.to_numeric,errors='coerce')
    edge_features = edges_imputer.fit_transform(edges)
    edge_features = np.concatenate(
        [edge_features,edge_attr_ang,edge_attr_dir],axis=1)

    nodes = nodes.drop(['y','x','osmid','highway'],axis=1)
    nodes = nodes.apply(pd.to_numeric,errors='coerce')
    node_features = nodes_imputer.fit_transform(nodes)

    print(
        f"Number of nodes: {len(node_features)}"
    )

    print(
        f"Number of node features: {len(node_features[0])}"
    )

    print(
        f"Number of edges: {len(edge_features)}"
    )

    print(
        f"Positive Class relative frequency: {len(y[y==1])/len(y)}"
    )

    # np.savez does not create missing directories.
    out_dir = osp.join(fp,name,"raw")
    os.makedirs(out_dir, exist_ok=True)
    np.savez(f'{osp.join(out_dir,name)}.npz',
        x=node_features,
        y=y,
        edge_attr=edge_features,edge_index=edge_index,
        edge_attr_ang=edge_attr_ang,
        edge_attr_dir=edge_attr_dir,
        coordinates=coordinates)
	def graph_download(coords:list,
	                   root_dir:str,
	                   name:str,simplified=False,
	                   distance=9000,
	                   keep_geometries=False,
	                   nominatim_endpoint='http://localhost:8088/',
	                   network_type='drive'):
	    """Build a node-labelled road-network dataset around ``coords`` and save it as ``.npz``.

	    Unless ``<root_dir>/raw/edges.csv`` exists, the network is downloaded with
	    ``ox.graph_from_point``, edge attributes are cleaned into numeric or
	    categorical values, and nodes without a ``highway`` tag are looked up
	    through the Nominatim reverse endpoint. Afterwards nodes are labelled 1
	    when they are one of the 5 nearest nodes of an accident point and closer
	    than 6e-3 coordinate units, highway classes are one-hot encoded, missing
	    values are imputed, and the arrays are written with ``np.savez`` to
	    ``<root_dir>/<name>/raw/<name>.npz``.

	    The function relies on module-level names that are not defined here:
	    ``unfaelle`` (a table with ``YGCSWGS84``/``XGCSWGS84`` columns), ``filter``,
	    ``ang_gen`` and ``dir_gen``.

	    Args:
	        coords: Centre point handed to ``ox.graph_from_point``; ``coords[0]``
	            and ``coords[1]`` are also forwarded to ``filter``
	            (presumably latitude, longitude -- confirm against callers).
	        root_dir: Working directory. It is created when missing and receives
	            the intermediate ``edges.csv`` / ``nodes.csv`` files.
	        name: Sub-directory and file stem of the ``.npz`` output.
	        simplified: Stored in the graph attributes that are used when the
	            graph is rebuilt from cached CSVs. The download itself always
	            uses ``simplify=False``.
	        distance: Passed as ``dist`` to ``ox.graph_from_point``; ``2*distance``
	            is forwarded to ``filter``.
	        keep_geometries: Forwarded as ``node_geometry`` and
	            ``fill_edge_geometry`` to ``ox.graph_to_gdfs``.
	        nominatim_endpoint: Base URL to which ``reverse.php?...`` is appended.
	        network_type: Forwarded to ``ox.graph_from_point``.

	    Raises:
	        TypeError: If an edge ``highway`` value is not a str, list or float.
	        ValueError: If the edge index contains NaN, or if cached node/edge
	            indices are not unique.
	    """
	    # NOTE(review): this definition repeats the `graph_download` defined
	    # earlier in the file; its block structure was rebuilt from a copy whose
	    # indentation had been flattened. Keep only one of the two definitions.

	    def _lane_count(value):
	        # Lists of lane counts are summed; unparsable text becomes NaN;
	        # anything else (e.g. NaN) passes through unchanged.
	        try:
	            if isinstance(value, list):
	                return sum(float(part) for part in value)
	            if isinstance(value, str):
	                return float(value)
	        except ValueError:
	            return np.nan
	        return value

	    def _highway_pair(tag):
	        # One (u, v) highway class per edge: a single tag is used for both
	        # ends, a list contributes its first two entries, NaN (a float)
	        # falls back to "unclassified".
	        if isinstance(tag, str):
	            return [tag, tag]
	        if isinstance(tag, list):
	            return [tag[0], tag[1]]
	        if isinstance(tag, float):
	            return ["unclassified", "unclassified"]
	        raise TypeError(
	            f"Could not unnest value with type: {type(tag)}"
	            )

	    graph_attrs = {'crs': 'epsg:4326',
	                   'simplified': simplified
	                   }
	    cols_dtypes = {"u":int,"v":int,
	                   "oneway":bool,
	                   "maxspeed":float,
	                   "reversed":bool,
	                   "length":float,
	                   "lanes":float,
	                   "access":bool,
	                   "bridge":bool,
	                   "width":float,
	                   "tunnel":bool,
	                   "u_highway":str,
	                   "v_highway":str
	                   }
	    yes_no = {"yes": 1, "no": 0}

	    fp = osp.join(root_dir)
	    raw_fp = osp.join(fp,'raw')
	    # Joined paths, so a root_dir without a trailing separator no longer
	    # produces sibling files such as "<root_dir>edges.csv".
	    edges_csv = osp.join(fp,'edges.csv')
	    nodes_csv = osp.join(fp,'nodes.csv')

	    # NOTE(review): the cache probe looks in <root_dir>/raw/, while the CSVs
	    # below are written to <root_dir>/ -- the cached branch is therefore only
	    # taken when edges.csv was placed in raw/ by other means. Confirm the
	    # intended layout before unifying the two locations.
	    if not os.path.exists(osp.join(raw_fp,'edges.csv')):
	        os.makedirs(fp, exist_ok=True)
	        print("Files don't exist in path. Downloading through Overpass.")
	        G = ox.graph_from_point(coords,
	                                dist=distance,
	                                dist_type="network",
	                                network_type=network_type,
	                                simplify=False,
	                                truncate_by_edge=True)
	        G = ox.speed.add_edge_speeds(G)
	        nodes,edges = ox.graph_to_gdfs(G,
	                                        nodes=True,
	                                        node_geometry=keep_geometries,
	                                        fill_edge_geometry=keep_geometries)

	        # Columns are only present when at least one edge carries the tag,
	        # so every drop (including 'maxspeed') tolerates a missing column.
	        edges = edges.drop(columns=['junction','geometry','ref','name',
	                                    'osmid','area','est_width','maxspeed'],
	                           errors='ignore')

	        # Results are assigned back explicitly: in-place calls on
	        # `edges.<column>` do not reach the frame under pandas Copy-on-Write.
	        if "tunnel" in edges.columns:
	            edges["tunnel"] = edges["tunnel"].replace(
	                {"building_passage": 1, **yes_no}).fillna(0)

	        if "bridge" in edges.columns:
	            edges["bridge"] = edges["bridge"].replace(yes_no).fillna(0)

	        if "access" in edges.columns:
	            counts = edges["access"].value_counts()
	            # Values seen on fewer than 2.5% of the edges are treated as 'yes'.
	            rare = counts.index[counts < len(edges)*0.025]
	            edges["access"] = np.where(edges["access"].isin(rare), 'yes',
	                                       edges["access"])
	            edges["access"] = edges["access"].replace(yes_no).fillna(1)

	        if "width" in edges.columns:
	            # Keeps the first run of digits only (e.g. "3.5" -> 3.0).
	            edges["width"] = edges["width"].str.extract(
	                r'(\d+)', expand=False).astype(float)

	        if "highway" in nodes.columns:
	            # One session for all lookups so the connection is reused.
	            with requests.Session() as session:
	                def fillhighway(row):
	                    # Untagged nodes hold NaN (a float): ask Nominatim for
	                    # the feature type at the node position instead.
	                    if isinstance(row.highway,float):
	                        res_obj = session.get(
	                            nominatim_endpoint + f'reverse.php?lat={row.y}&lon={row.x}&format=jsonv2&extratags=1&zoom=17',
	                            timeout=60).json()
	                        return res_obj["type"]
	                    return row["highway"]
	                nodes["highway"] = nodes.apply(fillhighway,axis=1)
	            counts = nodes["highway"].value_counts()
	            # Classes seen on fewer than 2.5% of the nodes become 'residential'.
	            rare = counts.index[counts < (len(nodes)*0.025)]
	            nodes["highway"] = np.where(nodes["highway"].isin(rare),
	                                        'residential',
	                                        nodes["highway"])

	        if "lanes" in edges.columns:
	            edges["lanes"] = pd.Series(
	                [_lane_count(value) for value in edges["lanes"]],
	                index=edges.index).astype(np.float32)

	        highway_pairs = np.array(
	            [_highway_pair(tag) for tag in edges["highway"].to_list()])
	        edges['u_highway'] = highway_pairs[0:,0]
	        edges['v_highway'] = highway_pairs[0:,1]
	        edges = edges.drop(columns='highway')

	        # `== np.nan` is never true, so the index is checked with isna().
	        if edges.index.to_frame(index=False).isna().to_numpy().any():
	            raise ValueError("Edge index contains NaN entries.")
	        nodes = nodes.drop(columns="ref", errors='ignore')

	        edges["u_highway"] = edges["u_highway"].astype("category")
	        edges["v_highway"] = edges["v_highway"].astype("category")
	    else:
	        print("Found csv files.")
	        nodes = gpd.read_file(nodes_csv,
	                              layer = 'nodes').convert_dtypes().set_index('osmid')
	        edges = gpd.read_file(edges_csv,
	                              layer='edges').convert_dtypes().set_index(
	                                  ['u','v','key'])
	        if not (nodes.index.is_unique and edges.index.is_unique):
	            raise ValueError("Cached node/edge indices are not unique.")
	        G = ox.graph_from_gdfs(nodes,edges, graph_attrs)

	    # --- node labels ------------------------------------------------------
	    y = np.zeros(len(nodes),dtype=np.int8)
	    unf = unfaelle[unfaelle.apply(filter, args = (coords[0],coords[1],2*distance), axis=1)]
	    unf = unf[["YGCSWGS84","XGCSWGS84"]].to_numpy()
	    pts = nodes[['y','x']].to_numpy()

	    tree = spatial.KDTree(pts)
	    if len(unf):
	        # All accident points are queried at once. Neighbours that do not
	        # exist come back with an infinite distance and are masked out by
	        # the threshold (6e-3, in coordinate units; tuned for Germany).
	        nn_dist, nn_idx = tree.query(unf,k=5)
	        y[nn_idx[nn_dist <= 6e-3]] = 1

	    # --- geometric edge attributes ---------------------------------------
	    edge_attr_ang = np.array(
	        [ang_gen(idx[0], idx[1], G=G) for idx in edges.index],
	        dtype=np.float32)
	    # Every row in which ang_gen produced a NaN is replaced by (-1, -1, -1).
	    nan_rows = np.where(np.isnan(edge_attr_ang))[0]
	    edge_attr_ang[nan_rows] = np.array([-1,-1,-1],dtype=np.float32)

	    edge_attr_dir = np.array(
	        [dir_gen(idx[0], idx[1], G=G) for idx in edges.index],
	        dtype=np.float32)

	    # --- CSV round trip: flattens the indices into plain columns ----------
	    edges = edges.drop(columns="geometry", errors='ignore')
	    nodes = nodes.drop(columns="geometry", errors='ignore')
	    edges.to_csv(edges_csv)
	    nodes.to_csv(nodes_csv)
	    edges = pd.read_csv(edges_csv,dtype=cols_dtypes)
	    nodes = pd.read_csv(nodes_csv)

	    # --- one-hot encoding of the highway classes -------------------------
	    # The encoder is fitted on u_highway only and reused for v_highway and
	    # for the node highway column.
	    benc_highway = OneHotEncoder(return_df=True,drop_invariant=False)
	    benc_highway.fit(edges['u_highway'].to_numpy())

	    edges_encoded_u = benc_highway.transform(
	        edges['u_highway'].to_numpy())
	    edges_encoded_v = benc_highway.transform(
	        edges['v_highway'].to_numpy())
	    nodes_encoded = benc_highway.transform(
	        nodes['highway'].to_numpy())

	    edges = pd.concat([edges,
	                       edges_encoded_u,
	                       edges_encoded_v],
	                      axis = 1)
	    nodes = pd.concat([nodes,
	                       nodes_encoded],
	                      axis=1)

	    # --- edge index: row positions of the end nodes of every edge ---------
	    # A single osmid -> row lookup replaces one full column scan per edge.
	    node_position = {osmid: position for position, osmid
	                     in enumerate(nodes['osmid'].to_numpy())}
	    edge_index = np.array(
	        [[node_position[u], node_position[v]]
	         for u, v in edges[['u','v']].to_numpy()],
	        dtype=np.int32)

	    coordinates = torch.from_numpy(
	        nodes.iloc[:,1:3].to_numpy()).to(torch.float)

	    # --- numeric feature matrices with imputed gaps -----------------------
	    edges_imputer = IterativeImputer()
	    nodes_imputer = IterativeImputer()

	    edges = edges.drop(['u',
	                        'v',
	                        'key',
	                        'u_highway',
	                        'v_highway'],
	                       axis=1)
	    edges = edges.apply(pd.to_numeric,errors='coerce')
	    edge_features = edges_imputer.fit_transform(edges)
	    edge_features = np.concatenate(
	        [edge_features,edge_attr_ang,edge_attr_dir],axis=1)

	    nodes = nodes.drop(['y','x','osmid','highway'],axis=1)
	    nodes = nodes.apply(pd.to_numeric,errors='coerce')
	    node_features = nodes_imputer.fit_transform(nodes)

	    print(
	        f"Number of nodes: {len(node_features)}"
	    )

	    print(
	        f"Number of node features: {len(node_features[0])}"
	    )

	    print(
	        f"Number of edges: {len(edge_features)}"
	    )

	    print(
	        f"Positive Class relative frequency: {len(y[y==1])/len(y)}"
	    )

	    # np.savez does not create missing directories.
	    out_dir = osp.join(fp,name,"raw")
	    os.makedirs(out_dir, exist_ok=True)
	    np.savez(f'{osp.join(out_dir,name)}.npz',
	        x=node_features,
	        y=y,
	        edge_attr=edge_features,edge_index=edge_index,
	        edge_attr_ang=edge_attr_ang,
	        edge_attr_dir=edge_attr_dir,
	        coordinates=coordinates)
# No results found