Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save AdrianoPereira/d6618812c63079d7c309b6213a435103 to your computer and use it in GitHub Desktop.

Select an option

Save AdrianoPereira/d6618812c63079d7c309b6213a435103 to your computer and use it in GitHub Desktop.
Parallel code to load, crop, and save GSMaP data from the global grid to a specific region's bounding box
import os
import numpy as np
import gzip
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm
# Root directory that is scanned for gzipped input files, and the root
# directory under which the cropped copies are written.
input_base_dir = "/storage/hokusai/GSMaP_NOW/now/half_hour/2024"
output_base_dir = "/storage/hokusai/GSMaP_NOW/SA_2024"

# Extent of the full (global) input grid: latitude and longitude limits.
glat_min, glat_max = -60.0, 60.0
glon_min, glon_max = -180.0, 180.0

# Bounding box of the region to extract from the global grid.
rlat_min, rlat_max = -55.0, 6.0
rlon_min, rlon_max = -83.0, -33.0
def read_data(filepath, shape=(1200, 3600)):
    """Load one gzipped raw float32 grid and reorient it.

    The decompressed payload is interpreted as a flat ``float32`` array,
    reshaped to ``shape``, rolled by half its width along the column axis
    and then flipped along the row axis.

    Args:
        filepath: Path of the gzip-compressed binary file.
        shape: ``(rows, cols)`` of the stored grid. Defaults to the
            1200 x 3600 layout the script was written for.

    Returns:
        A 2-D ``float32`` array with the requested ``shape``.

    Raises:
        ValueError: If the payload size does not match ``shape``.
        OSError: If the file cannot be opened or is not valid gzip data.
    """
    n_rows, n_cols = shape
    with gzip.open(filepath, mode='rb') as handle:
        raw = handle.read()

    # Fail with an explicit message instead of an opaque reshape error when
    # the file is truncated or has a different grid size.
    expected_bytes = n_rows * n_cols * np.dtype(np.float32).itemsize
    if len(raw) != expected_bytes:
        raise ValueError(
            f"{filepath}: expected {expected_bytes} bytes for a "
            f"{n_rows}x{n_cols} float32 grid, got {len(raw)}"
        )

    data = np.frombuffer(raw, dtype=np.float32).reshape(n_rows, n_cols)
    # Swap the left and right halves of every row, then reverse the row
    # order.
    # NOTE(review): presumably this moves the longitude origin from
    # 0..360 to -180..180 — confirm against the data specification.
    # NOTE(review): crop_data treats row 0 as the maximum latitude; verify
    # that the row flip here actually yields that ordering for these files.
    data = np.roll(data, shift=n_cols // 2, axis=1)[::-1]
    return data
def crop_data(data, lat_min, lat_max, lon_min, lon_max, grid_extent=None):
    """Slice a latitude/longitude bounding box out of a regular 2-D grid.

    Row 0 is taken to lie at the grid's maximum latitude and column 0 at
    its minimum longitude; the cell size is derived from the grid extent
    and ``data.shape``.

    Args:
        data: 2-D array covering the full grid.
        lat_min: Lower latitude bound of the box.
        lat_max: Upper latitude bound of the box.
        lon_min: Lower longitude bound of the box.
        lon_max: Upper longitude bound of the box.
        grid_extent: Optional ``(lat_min, lat_max, lon_min, lon_max)`` of
            the full grid. When omitted, the module-level ``glat_min``,
            ``glat_max``, ``glon_min`` and ``glon_max`` are used.

    Returns:
        The slice ``data[row_start:row_stop, col_start:col_stop]`` (a view
        of ``data``, not a copy). Bounds lying outside the grid are clamped
        to its edges.
    """
    if grid_extent is None:
        grid_extent = (glat_min, glat_max, glon_min, glon_max)
    g_lat_min, g_lat_max, g_lon_min, g_lon_max = grid_extent

    n_rows, n_cols = data.shape[0], data.shape[1]
    lat_resolution = (g_lat_max - g_lat_min) / n_rows
    lon_resolution = (g_lon_max - g_lon_min) / n_cols

    def _to_index(offset, resolution, size):
        # The tiny tolerance keeps a bound sitting exactly on a cell edge
        # from being truncated one cell short by floating-point error
        # (e.g. 0.3 / 0.1 evaluates to 2.9999999999999996).
        index = int(np.floor(offset / resolution + 1e-9))
        # Clamp so an out-of-grid bound cannot become a negative index,
        # which would silently wrap around to the far side of the array.
        return min(max(index, 0), size)

    row_start = _to_index(g_lat_max - lat_max, lat_resolution, n_rows)
    row_stop = _to_index(g_lat_max - lat_min, lat_resolution, n_rows)
    col_start = _to_index(lon_min - g_lon_min, lon_resolution, n_cols)
    col_stop = _to_index(lon_max - g_lon_min, lon_resolution, n_cols)

    cropped_data = data[row_start:row_stop, col_start:col_stop]
    return cropped_data
def process_file(file_info):
    """Crop one input file to the configured region and write the result.

    The output path mirrors the input file's path relative to the input
    root. The cropped grid is written as gzip-compressed raw ``float32``
    bytes.

    Args:
        file_info: Tuple ``(input_file, input_dir, output_dir)`` of
            ``Path`` objects, as produced by ``get_file_list``.

    Returns:
        ``(True, input_file)`` on success, or
        ``(False, (input_file, exception))`` if any step failed. Failures
        are returned rather than raised so one bad file does not abort
        the whole batch.
    """
    input_file, input_dir, output_dir = file_info
    tmp_file = None
    try:
        # Path computation and directory creation live inside the try so
        # their failures are reported through the return value as well.
        output_file = output_dir / input_file.relative_to(input_dir)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        # Write to a temporary sibling first so an interrupted or failed
        # write never leaves a truncated file under the final name.
        tmp_file = output_file.with_name(output_file.name + ".tmp")

        data = read_data(input_file)
        cropped_data = crop_data(data, rlat_min, rlat_max, rlon_min, rlon_max)
        with gzip.open(tmp_file, 'wb') as f_out:
            # astype guarantees float32 bytes whatever dtype came in.
            f_out.write(cropped_data.astype(np.float32).tobytes())
        os.replace(tmp_file, output_file)
        return True, input_file
    except Exception as e:
        if tmp_file is not None:
            try:
                os.remove(tmp_file)
            except OSError:
                # Best-effort cleanup: the temp file may never have been
                # created, and the original error is what gets reported.
                pass
        return False, (input_file, e)
def get_file_list(input_dir):
    """Collect every ``.gz`` file below ``input_dir`` as a work item.

    Args:
        input_dir: Root directory to walk recursively (str or ``Path``).

    Returns:
        A list of ``(file_path, input_root, output_root)`` tuples of
        ``Path`` objects, where ``output_root`` is built from the
        module-level ``output_base_dir``.
    """
    input_dir = Path(input_dir)
    files = []
    for dirpath, _subdirs, names in os.walk(input_dir):
        parent = Path(dirpath)
        files.extend(
            (parent / name, input_dir, Path(output_base_dir))
            for name in names
            if name.endswith('.gz')
        )
    return files
def main():
    """Crop every input file in parallel and report the failures.

    Builds the work list from ``input_base_dir``, submits one
    ``process_file`` task per file to a process pool, and prints an error
    line for each file that could not be processed while a progress bar
    tracks completion.
    """
    files = get_file_list(input_base_dir)
    print(f"Total de arquivos a processar: {len(files)}")
    with ProcessPoolExecutor() as executor:
        # Map each future back to its input path so a failure can still be
        # attributed when the worker could not return its own result.
        futures = {executor.submit(process_file, file_info): file_info[0] for file_info in files}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processando arquivos"):
            try:
                success, result = future.result()
            except Exception as error:
                # result() re-raises anything that escaped the worker
                # (e.g. a crashed process); report it and keep going
                # instead of aborting the whole batch.
                tqdm.write(f"Erro ao processar {futures[future]}: {error}")
                continue
            if not success:
                file_path, error = result
                # tqdm.write prints without breaking the progress bar.
                tqdm.write(f"Erro ao processar {file_path}: {error}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment