Soil health monitoring using Sentinel-2 MSI in Vanuatu - ML training for Climate Change and Natural Disasters

In this lesson, we will use multi-spectral imagery, cloud-masked and temporally composited, to calculate a comprehensive set of remote sensing indices for characterizing soil health. We’ll summarize the correlations and variance within these indices using Prinicipal Components Analysis (PCA) and calculate a proxy soil health score for each pixel in our composite.

import os
import glob
import math
import pyproj
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import geopandas as gpd
import rasterio
from rasterio.features import geometry_mask
from sklearn.decomposition import PCA
import tqdm
import xarray as xr
import rioxarray as rxr
import planetary_computer
import pystac_client
from pystac_client import Client
import odc.stac
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from shapely.geometry import mapping, shape, MultiPolygon, Point, Polygon, box
from shapely.ops import transform
import dask
import dask.array as da
import dask.delayed

admin_boundaries_gdf = gpd.read_file("./2016_phc_vut_pid_4326.geojson")
#admin_boundaries_gdf = admin_boundaries_gdf.set_index(keys="pname")  # set province name as the index

admin_boundaries_gdf

PROVINCE = "SHEFA"
DATERANGE_START = "2025-04-01"
DATERANGE_END = "2025-08-01"

admin_boundaries_gdf.columns

Index(['pid', 'pname', 'geometry'], dtype='object')

# Filter to one province (e.g., SHEFA)
#admin_boundaries_gdf = admin_boundaries_gdf.to_crs("EPSG:32759")
province_match = admin_boundaries_gdf[admin_boundaries_gdf["pname"] == PROVINCE]

province_match

province_match.geometry

4    MULTIPOLYGON (((168.30762 -17.77548, 168.30773...
Name: geometry, dtype: geometry

mp = province_match.loc[4, "geometry"]

if isinstance(mp, MultiPolygon):
    # Sort polygons by area
    polys_sorted = sorted(mp.geoms, key=lambda p: p.area)

    if len(polys_sorted) >= 3:
        #selected = polys_sorted[len(polys_sorted) // 2]
        selected = polys_sorted[-5]
    elif len(polys_sorted) == 2:
        # For just two, pick the larger or smaller depending on definition
        selected = polys_sorted[0]  # or polys_sorted[1]
    else:
        selected = polys_sorted[0]  # Only one polygon

else:
    selected = mp  # Already a single polygon

print(len(polys_sorted))
selected

# Access AWS STAC for Sentinel-2 Data
aws_stac_url = "https://earth-search.aws.element84.com/v1"
stac_client = Client.open(aws_stac_url)

# Search Sentinel-2 data on AWS with cloud cover less than 50%
s2_search = stac_client.search(
    collections=["sentinel-2-l2a"],
    bbox=selected.bounds,
    datetime=f"{DATERANGE_START}/{DATERANGE_END}", 
    query={"eo:cloud_cover": {"lt": 50}}  # Filter by cloud cover < 50%
)

# Retrieve all items from search results
s2_items = s2_search.item_collection()

len(s2_items)

48

s2_data = odc.stac.load(
    items=s2_items,
    bands=["red", "green", "blue", "nir", "swir16", "scl"],
    bbox=selected.bounds,
    crs="EPSG:32759", #"EPSG:32759",
    chunks={'x': 1024, 'y': 1024, 'bands': -1, 'time': -1},
    resolution=30
)

bands = ['red', 'green', 'blue', 'nir', 'swir16']

cloud_classes = [3, 8, 9, 10]  # Cloud-related SCL classes
cloud_mask = s2_data['scl'].isin(cloud_classes)
s2_data_masked = s2_data[bands].where(~cloud_mask, drop=False)  # Keep all pixels

# Average across the time dimension
s2_data_composite = s2_data_masked.median(dim='time')

s2_data_composite

s2_data_composite[["red", "green", "blue"]].to_array("band").plot.imshow(rgb="band", robust=True, size=5, aspect=1)

Mask land¶

project = pyproj.Transformer.from_crs("EPSG:4326", "EPSG:32759", always_xy=True).transform
selected_proj = transform(project, selected)

mask_land = geometry_mask(
    geometries=[mapping(selected_proj)],
    transform=s2_data_composite.odc.transform,
    out_shape=(s2_data_composite.sizes["y"], s2_data_composite.sizes["x"]),
    invert=True  # We want True = inside the buffer
)

mask_land_xr = xr.DataArray(
    mask_land,
    dims=("y", "x"),
    coords={"y": s2_data_composite.y, "x": s2_data_composite.x}
)

s2_data_composite_masked = s2_data_composite.where(mask_land_xr)

s2_data_composite_masked[["red", "green", "blue"]].to_array("band").plot.imshow(rgb="band", robust=True, size=5, aspect=1)

Remote sensing indices for soil health¶

Monitoring soil health and vegetation condition using satellite imagery relies on various spectral indices that capture moisture, vegetation vigor, soil exposure, and disturbance.

The Normalized Difference Moisture Index (NDMI) assesses vegetation and soil moisture by comparing near-infrared (NIR) and short-wave infrared (SWIR) reflectance, providing insight into water availability crucial for plant growth.

The Bare Soil Index (BSI) highlights areas of exposed or sparsely vegetated soil, which can be indicators of erosion or land degradation.

Vegetation vigor and cover are captured by the Normalized Difference Vegetation Index (NDVI), a widely used metric that serves as a proxy for biomass and root activity, thereby indirectly indicating soil protection from erosion.

To improve vegetation detection in areas with significant soil exposure, Soil-Adjusted Vegetation Index (SAVI) and its modification MSAVI2 are used; they adjust for soil brightness effects, enabling more accurate vegetation assessment in sparse or mixed soil-vegetation landscapes.

Lastly, the Normalized Burn Ratio (NBR) is sensitive to vegetation disturbance such as fire damage and also reflects exposed soil and land degradation processes. Together, these indices provide a comprehensive view of soil moisture status, vegetation health, bare soil extent, and disturbance—critical factors for assessing soil condition, erosion risk, and ecosystem health in agricultural and natural landscapes.

def compute_ndmi(nir, swir):
    """
    Compute the Normalized Difference Moisture Index (NDMI).

    NDMI measures vegetation and soil moisture by comparing near-infrared (NIR) and short-wave infrared (SWIR) reflectance.

    Formula:
        NDMI = (NIR - SWIR) / (NIR + SWIR)

    Parameters
    ----------
    nir : xarray.DataArray
        Near-infrared reflectance band.
    swir : xarray.DataArray
        Short-wave infrared reflectance band.

    Returns
    -------
    xarray.DataArray
        NDMI values for each pixel.
    """
    return (nir - swir) / (nir + swir)


def compute_bsi(blue, red, nir, swir):
    """
    Compute the Bare Soil Index (BSI).

    BSI identifies bare or sparsely vegetated soil areas, which can indicate erosion or degraded land.

    Formula:
        BSI = ((SWIR + Red) - (NIR + Blue)) / ((SWIR + Red) + (NIR + Blue))

    Parameters
    ----------
    blue : xarray.DataArray
        Blue band reflectance.
    red : xarray.DataArray
        Red band reflectance.
    nir : xarray.DataArray
        Near-infrared reflectance band.
    swir : xarray.DataArray
        Short-wave infrared reflectance band.

    Returns
    -------
    xarray.DataArray
        BSI values for each pixel.
    """
    return ((swir + red) - (nir + blue)) / ((swir + red) + (nir + blue))


def ndvi(red, nir):
    """
    Compute the Normalized Difference Vegetation Index (NDVI).

    NDVI measures vegetation greenness and health. 
    NDVI gives vegetation cover health (proxy for root activity, biomass and reduced erosion). 

    Formula:
        NDVI = (NIR - Red) / (NIR + Red)

    Parameters
    ----------
    red : xarray.DataArray
        Red band reflectance.
    nir : xarray.DataArray
        Near-infrared reflectance band.

    Returns
    -------
    xarray.DataArray
        NDVI values for each pixel.
    """
    return (nir - red) / (nir + red)


def savi(red, nir, L=0.5):
    """
    Compute the Soil-Adjusted Vegetation Index (SAVI).

    SAVI adjusts NDVI to minimize soil brightness effects, useful in sparse vegetation areas.

    Formula:
        SAVI = ((NIR - Red) * (1 + L)) / (NIR + Red + L)

    Parameters
    ----------
    red : xarray.DataArray
        Red band reflectance.
    nir : xarray.DataArray
        Near-infrared reflectance band.
    L : float, optional
        Soil adjustment factor (default 0.5 for intermediate vegetation cover).

    Returns
    -------
    xarray.DataArray
        SAVI values for each pixel.
    """
    return ((nir - red) * (1 + L)) / (nir + red + L)


def msavi2(red, nir):
    """
    Compute the Modified Soil-Adjusted Vegetation Index 2 (MSAVI2).

    MSAVI2 improves vegetation detection in areas with significant bare soil by removing the need for a soil adjustment factor.

    Formula:
        MSAVI2 = [2 * NIR + 1 - sqrt((2 * NIR + 1)^2 - 8 * (NIR - Red))] / 2

    Parameters
    ----------
    red : xarray.DataArray
        Red band reflectance.
    nir : xarray.DataArray
        Near-infrared reflectance band.

    Returns
    -------
    xarray.DataArray
        MSAVI2 values for each pixel.
    """
    a = 2 * nir + 1
    b = 2 * (nir**2 - nir * red)
    return (a - np.sqrt(a**2 - b)) / 2


def nbr(nir, swir16):
    """
    Compute the Normalized Burn Ratio (NBR).

    NBR detects burned areas and vegetation disturbance.  
    Can also indicate exposed soil and land degradation.

    Formula:
        NBR = (NIR - SWIR) / (NIR + SWIR)

    Parameters
    ----------
    nir : xarray.DataArray
        Near-infrared reflectance band.
    swir16 : xarray.DataArray
        Short-wave infrared reflectance band (SWIR1).

    Returns
    -------
    xarray.DataArray
        NBR values for each pixel.
    """
    return (nir - swir16) / (nir + swir16)


s2_data_composite_masked = s2_data_composite_masked.assign(
    ndmi=compute_ndmi(s2_data_composite_masked['nir'], s2_data_composite_masked['swir16']),
    bsi=compute_bsi(
        s2_data_composite_masked['blue'],
        s2_data_composite_masked['red'],
        s2_data_composite_masked['nir'],
        s2_data_composite_masked['swir16']
    ),
    ndvi=ndvi(s2_data_composite_masked['red'], s2_data_composite_masked['nir']),
    savi=savi(s2_data_composite_masked['red'], s2_data_composite_masked['nir']),
    msavi2=msavi2(s2_data_composite_masked['red'], s2_data_composite_masked['nir']),
    nbr=nbr(s2_data_composite_masked['nir'], s2_data_composite_masked['swir16'])
)


def plot_index(index_data, title, cmap='BrBG'):
    plt.figure(figsize=(6, 4))
    index_data.plot(cmap=cmap, robust=True)  # robust=True avoids outlier influence
    plt.title(title, fontsize=10)
    plt.axis('off')
    plt.show()


plot_index(s2_data_composite_masked['ndmi'], 'NDMI - Soil Moisture Indicator')
plot_index(s2_data_composite_masked['bsi'], 'BSI - Bare Soil (Erosion Proxy)')
plot_index(s2_data_composite_masked['ndvi'], 'NDVI - Vegetation Greenness')
plot_index(s2_data_composite_masked['savi'], 'SAVI - Soil-Adjusted Vegetation Index')
plot_index(s2_data_composite_masked['msavi2'], 'MSAVI2 - Modified SAVI')
plot_index(s2_data_composite_masked['nbr'], 'NBR - Disturbance/Burn Ratio')

PCA analysis¶

Let’s search for the dominant pattern of variation across our indices.

Our PCA’s first principal component (PC1) will represent the dominant combined variation in soil moisture, vegetation greenness, soil adjustment, and disturbance indices.

PC1 is a linear combination of all your input bands/indices weighted to explain the maximum variance in the data. Since we included vegetation and soil indices, PC1 often reflects a gradient between soil and vegetation characteristics. For example, it might differentiate areas with healthy, dense vegetation (high NDVI, high moisture) from areas dominated by bare or sparse soil (high BSI, low NDVI). Pixels with high PC1 scores might correspond to lush, healthy vegetation or moist soils; pixels with low PC1 scores might correspond to bare soil, dry or degraded areas.

Goal: to reduce multiple correlated bands and indices into a single composite index capturing the main “soil-vegetation health axis.” This should help summarize complex spectral information into one interpretable value per pixel.

stack = np.stack([s2_data_composite_masked[v].values for v in 
                  ['ndmi','bsi','ndvi','msavi2','nbr']])  # shape (bands, t, y, x)

stack.shape

(5, 273, 237)

What the PC1 likely represents with these indices:¶

PC1 combines moisture, vegetation health, and disturbance info into one axis that captures the biggest variation in your scene.

Areas with high PC1 scores likely correspond to healthy, moist, undisturbed vegetation. Areas with low PC1 scores likely correspond to bare soil, dry areas, or disturbed/affected vegetation.

Index	Meaning	Expected contribution to PC1
NDMI (Normalized Difference Moisture Index)	Soil/vegetation moisture	Higher values → more moisture → likely positive loading
BSI (Bare Soil Index)	Bare soil presence	Higher values → more bare soil → usually negative loading (since bare soil = poorer vegetation/health)
NDVI (Normalized Difference Vegetation Index)	Vegetation greenness	Higher values → greener vegetation → positive loading
SAVI (Soil Adjusted Vegetation Index)	Vegetation with soil adjustment	Similar to NDVI but corrects soil influence → positive loading
MSAVI2 (Modified Soil Adjusted Vegetation Index 2)	Vegetation with optimized soil adjustment	Also vegetation indicator → positive loading
NBR (Normalized Burn Ratio)	Vegetation disturbance (e.g., fire)	Lower values indicate disturbance; higher values healthy → positive loading expected

D, Y, X = stack.shape

# flatten spatial dims to samples: (N, D) where N = Y*X
stack_reshaped = stack.reshape(D, Y * X).T  # shape (N, D)

# Create mask for rows without any NaNs
valid_mask = ~np.isnan(stack_reshaped).any(axis=1)

# Filter valid rows only
stack_valid = stack_reshaped[valid_mask]

# Optional: sample if too large
max_samples = 10000
if stack_valid.shape[0] > max_samples:
    idx = np.random.choice(stack_valid.shape[0], max_samples, replace=False)
    pca_data = stack_valid[idx]
else:
    pca_data = stack_valid

# Run PCA on valid data only
pca = PCA(n_components=2)
pca.fit(pca_data)

# Transform all valid pixels
pca_scores_valid = pca.transform(stack_valid)

# Prepare full array with NaNs for invalid pixels
pca_scores = np.full((Y * X, 2), np.nan)
pca_scores[valid_mask] = pca_scores_valid

# reshape PC1 back to (Y, X)
pc1 = pca_scores[:, 0].reshape(Y, X)

# create DataArray for PC1
pc1_da = xr.DataArray(
    pc1,
    coords={
        'y': s2_data_composite_masked['y'],
        'x': s2_data_composite_masked['x']
    },
    dims=['y', 'x'],
    name='soil_vegetation_axis_PC1'
)

# reshape PC1 back to (Y, X)
pc2 = pca_scores[:, 1].reshape(Y, X)

# create DataArray for PC1
pc2_da = xr.DataArray(
    pc2,
    coords={
        'y': s2_data_composite_masked['y'],
        'x': s2_data_composite_masked['x']
    },
    dims=['y', 'x'],
    name='soil_vegetation_axis_PC1'
)

# plot
ax = pc1_da.plot(cmap='RdYlGn', robust=True, figsize=(10, 6))
#ax.set_title('Soil-Vegetation Axis (PC1)', fontsize=14)
plt.show()

# plot
ax = pc2_da.plot(cmap='RdYlGn', robust=True, figsize=(10, 6))
#ax.set_title('Soil-Vegetation Axis (PC1)', fontsize=14)
plt.show()

Principal Component 2 (PC2) captures the second largest independent source of variance, orthogonal to PC1. PC2 might represent more subtle or different contrasts in the data, such as:

Variations related to disturbance vs. regrowth (NBR might load more here),
Differences in vegetation structure or type, or
Soil texture or moisture contrasts not fully captured by PC1.

PC2 often helps highlight secondary gradients like areas recovering from disturbance or with different vegetation-soil mixtures.

Let’s check the component loadings:

bands_indices = ['ndmi', 'bsi', 'ndvi', 'msavi2', 'nbr']
loadings_df = pd.DataFrame(pca.components_, columns=bands_indices, index=['PC1', 'PC2'])
print(loadings_df)

         ndmi       bsi      ndvi    msavi2       nbr
PC1  0.000247 -0.000270  0.000309  1.000000  0.000247
PC2 -0.508097  0.544503 -0.432654  0.000531 -0.508098

Each loading is a coefficient that tells you how strongly each original variable (index) contributes to a principal component (PC). Loadings can be positive or negative, indicating the direction of the relationship. The magnitude (absolute value) indicates the strength of influence. PC1 loadings show which indices dominate the largest pattern—usually differentiating healthy, moist, dense vegetation (high NDVI, NDMI, MSAVI2) from bare soil or degraded land (high BSI). PC2 loadings capture the next most important pattern, independent of PC1, often related to disturbance, soil texture, or other subtle contrasts.

PC1 shows:

Dominated almost entirely by msavi2 (loading = 1.0)
Other indices have near-zero loadings, so PC1 essentially represents MSAVI2 alone.
Since MSAVI2 is a soil-adjusted vegetation index, PC1 here captures vegetation health adjusted for soil brightness, ignoring other indices.

PC2 shows:

Strong positive loading on bsi (0.54)
Strong negative loadings on ndmi (-0.51), ndvi (-0.43), and nbr (-0.51)
This suggests PC2 contrasts bare soil presence (BSI) with moisture (NDMI), vegetation greenness (NDVI), and disturbance/burn ratio (NBR).
So PC2 likely captures a bare soil vs vegetated/moist/disturbed gradient:
- High PC2 values mean more bare soil (high BSI)
- Low PC2 values mean more healthy, moist vegetation or recently disturbed areas

PC1 is almost purely MSAVI2, a refined vegetation index accounting for soil brightness. It’s oour main soil-adjusted vegetation axis.

PC2 contrasts bare soil (BSI) with vegetation moisture, greenness, and disturbance — a secondary axis separating exposed soil from vegetated or disturbed areas.

Why is this the case?

Our indices may have very different scales or variance; MSAVI2 dominates variance captured by PC1.
Other indices mainly separate out on PC2, showing a distinct “bare soil vs healthy/disturbed vegetation” gradient.

Takeaways:

Use PC1 as a strong indicator of vegetation health adjusted for soil.
Use PC2 to detect areas with bare soil exposure vs vegetated or disturbed zones.
Can use normalization such as Z-score normalization prior to PCA to account for the different index scales.

Creating a soil health score¶

We will created a score based on weighted inputs.

The weights¶

0.45 * ndmi_n → 45% weight to moisture

Soil moisture is a key driver of nutrient cycling and erosion prevention. Given nearly equal weight with NDVI because water availability is just as important as > vegetation cover.

0.45 * ndvi_n → 45% weight to vegetation greenness

Healthy vegetation cover prevents erosion, supports soil organisms, and maintains structure.

0.10 * (1 - bsi_n) → 10% weight to low bare soil exposure

(1 - bsi_n) flips the bare soil index so that less bare soil = higher score. Low bare soil reduces erosion risk and nutrient loss. Smaller weight here because bare soil impact is already partially captured by NDVI and NDMI.

Score range:¶

0 → low soil health (relatively dry, sparse vegetation, high bare soil).

1 → high soil health (more moisture, greener vegetation, low bare soil exposure).

def calculate_soil_health_score(
    ndmi: xr.DataArray,
    ndvi: xr.DataArray,
    bsi: xr.DataArray,
    weight_ndmi: float = 0.45,
    weight_ndvi: float = 0.45,
    weight_bsi: float = 0.10
) -> xr.DataArray:
    """
    Calculate a soil health score from NDMI, NDVI, and BSI DataArrays.
    
    The score is a weighted combination of:
    - NDMI (Normalized Difference Moisture Index)
    - NDVI (Normalized Difference Vegetation Index)
    - BSI (Bare Soil Index), inverted so higher vegetation cover improves score
    
    All indices are automatically normalized to 0–1 range before weighting,
    ignoring NaNs in the normalization calculations.

    Parameters
    ----------
    ndmi : xr.DataArray
        Raw NDMI values.
    ndvi : xr.DataArray
        Raw NDVI values.
    bsi : xr.DataArray
        Raw BSI values.
    weight_ndmi : float, optional
        Weight for NDMI in the score (default 0.45).
    weight_ndvi : float, optional
        Weight for NDVI in the score (default 0.45).
    weight_bsi : float, optional
        Weight for inverted BSI in the score (default 0.10).

    Returns
    -------
    xr.DataArray
        Soil health score scaled between 0 and 1, with NaNs preserved.
    """

    def normalize(da: xr.DataArray) -> xr.DataArray:
        """Normalize a DataArray to the 0–1 range ignoring NaNs."""
        da_min = da.min(skipna=True)
        da_max = da.max(skipna=True)
        return (da - da_min) / (da_max - da_min)

    # Normalize each index (NaNs preserved)
    ndmi_n = normalize(ndmi)
    ndvi_n = normalize(ndvi)
    bsi_n = normalize(bsi)

    # Compute weighted soil health score
    soil_health_score = (
        weight_ndmi * ndmi_n +
        weight_ndvi * ndvi_n +
        weight_bsi * (1 - bsi_n)  # invert BSI so higher vegetation improves score
    )

    # Normalize final score ignoring NaNs
    return normalize(soil_health_score)

score_da = calculate_soil_health_score(s2_data_composite_masked['ndmi'], 
                                       s2_data_composite_masked['ndvi'], 
                                       s2_data_composite_masked['bsi'])

score_da.plot()

s2_data_composite_masked[["red", "green", "blue"]].to_array("band").plot.imshow(rgb="band", robust=True, size=5, aspect=1)

Extra: Add a DEM¶

Load a DEM to understand elevation’s interplay with soil conditions. Some primary derivatives of a DEM that would be useful here is slope and aspect.

# Connect to Microsoft Planetary Computer STAC
stac_client = Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")

# Search NASA DEM
search = stac_client.search(
    collections=["nasadem"],
    bbox=selected.bounds,
    max_items=1
)

dem_item = planetary_computer.sign(search.item_collection()[0])

dem = odc.stac.load(
    items=[dem_item],
    crs="EPSG:32759", #4326",  # Match with other layers
    bbox=selected.bounds,  # Same bounding box as the composite
)

# Check output ranges
vals = dem["elevation"].isel(valid_time=0).values
vals_clean = vals[np.isfinite(vals)]

print("Unique:", np.unique(vals_clean))
print("Min:", vals_clean.min())
print("Max:", vals_clean.max())

s2_data_composite = s2_data_composite.assign(elevation=dem["elevation"])