Skip to content

Instantly share code, notes, and snippets.

@nocollier
Created September 24, 2025 20:03
Show Gist options
  • Select an option

  • Save nocollier/336d665d58dc7275dd4d5190ed79d0d0 to your computer and use it in GitHub Desktop.

Select an option

Save nocollier/336d665d58dc7275dd4d5190ed79d0d0 to your computer and use it in GitHub Desktop.
An analysis of the variance of land area in CMIP6 models
"""
What kind of land area spread is there among CMIP6 models?
This analysis will remove some models:
- For no 'areacella': EC-Earth3-LR, FGOALS-f3-L, NESM3
- For anomalously low values: EC-Earth3, EC-Earth3-Veg, EC-Earth3P, EC-Earth3P-HR
land_area relative_error [%]
count 74
mean 1.489046e+14 -0.064018
std 3.064177e+12 2.056495
min 1.445632e+14 -2.977698
25% 1.470748e+14 -1.292094
50% 1.487755e+14 -0.150658
75% 1.497160e+14 0.480537
max 1.663004e+14 11.610992
"""
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from intake_esgf import ESGFCatalog
# Models whose land area lies more than this many standard deviations from the
# sample mean are treated as outliers and dropped from the analysis.
Z_SCORE_THRESHOLD = 3 # Beyond this, we consider you an outlier
# Reference total land area of Earth in m^2 used for the relative-error column.
# NOTE(review): informal "Internet value" — source unverified.
EARTH_LAND_AREA = 1.49e14 # Internet value I found
def compute_land_area(ds: xr.Dataset) -> float:
    """Return the total land area [m^2] implied by a model's land fraction.

    The area is the cell-area-weighted sum of the land fraction, i.e.
    sum(sftlf / 100 * areacella) over all grid cells. If the dataset has no
    'areacella' variable, a warning is issued and NaN is returned so that the
    caller can drop the entry downstream.
    """
    if "areacella" not in ds:
        warnings.warn(f"areacella not found for {ds.attrs['source_id']}")
        return np.nan
    # sftlf is expected to be a percentage (max ~100); warn when it is not,
    # since the 0.01 conversion below would then be wrong.
    if not np.allclose(ds["sftlf"].max(), 100.0):
        warnings.warn(
            f"sftlf for {ds.attrs['source_id']} not in the expected range [{float(ds['sftlf'].min().values)},{float(ds['sftlf'].max().values)}]"
        )
    # Convert percent to fraction, weight by cell area, and reduce over the
    # spatial dimensions of areacella.
    land_fraction = ds["sftlf"] * 0.01
    weighted = land_fraction * ds["areacella"]
    return float(weighted.sum(ds["areacella"].dims).values)
# Locate one land-fraction (sftlf) dataset per model and compute its land area.
cat = ESGFCatalog().search(variable_id="sftlf").remove_ensembles()
cat.df = cat.df.groupby("source_id").first().reset_index()
rows = [
    {"source_id": ds.attrs["source_id"], "land_area": compute_land_area(ds)}
    for ds in cat.to_dataset_dict().values()
]
df = pd.DataFrame(rows).dropna()

# Some areas are anomalously off; let's remove them via a simple z-score test.
z_scores = ((df["land_area"] - df["land_area"].mean()) / df["land_area"].std()).abs()
outliers = df[z_scores > Z_SCORE_THRESHOLD]
if not outliers.empty:
    warnings.warn(f"Dropping some models as they are outliers:\n{outliers}")
    df = df.drop(index=outliers.index)

# Relative error against the published land area of ~1.49e14 m^2.
df["relative_error [%]"] = (df["land_area"] - EARTH_LAND_AREA) / EARTH_LAND_AREA * 100

# Report the per-model values and summary statistics, then save a histogram.
print(df.sort_values("land_area").to_string())
print(df.describe())
df.hist()
plt.savefig("land_area.png")
plt.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment