Source code for typhon.geographical

# -*- coding: utf-8 -*-

"""General functions for manipulating geographical data.
"""
from numbers import Number

import imageio
import numpy as np
from sklearn.neighbors import BallTree, KDTree
from typhon.constants import earth_radius
from typhon.geodesy import geocentric2cart
from typhon.utils import split_units


__all__ = [
    'area_weighted_mean',
    'GeoIndex',
    'gridded_mean',
    'sea_mask'
]


[docs]def area_weighted_mean(lon, lat, data): """Calculate the mean of gridded data on a sphere. Data points on the Earth's surface are often represented as a grid. As the grid cells do not have a constant area they have to be weighted when calculating statistical properties (e.g. mean). This function returns the weighted mean assuming a perfectly spherical globe. Parameters: lon (ndarray): Longitude (M) angles [degree]. lat (ndarray): Latitude (N) angles [degree]. data ()ndarray): Data array (N x M). Returns: float: Area weighted mean. """ # Calculate coordinates and steradian (in rad). lon = np.deg2rad(lon) lat = np.deg2rad(lat) dlon = np.diff(lon) dlat = np.diff(lat) # Longitudal mean middle_points = (data[:, 1:] + data[:, :-1]) / 2 norm = np.sum(dlon) lon_integral = np.sum(middle_points * dlon, axis=1) / norm # Latitudal mean lon_integral *= np.cos(lat) # Consider varying grid area (N-S). middle_points = (lon_integral[1:] + lon_integral[:-1]) / 2 norm = np.sum(np.cos((lat[1:] + lat[:-1]) / 2) * dlat) return np.sum(middle_points * dlat) / norm
# Factor to convert a length unit to kilometers UNITS_CONVERSION_FACTORS = [ [{"cm", "centimeter", "centimeters"}, 1e-6], [{"m", "meter", "meters"}, 1e-3], [{"km", "kilometer", "kilometers"}, 1], [{"mi", "mile", "miles"}, 1.609344], # english statute mile [{"yd", "yds", "yard", "yards"}, 0.9144e-3], [{"ft", "foot", "feet"}, 0.3048e-3], ] def to_kilometers(distance): """Convert different length units to kilometers Args: distance: A string or number. Returns: A distance as float in kilometers """ if isinstance(distance, Number): return distance elif not isinstance(distance, str): raise ValueError("Distance must be a number or a string!") length, unit = split_units(distance) if length == 0: raise ValueError("A valid distance length must be given!") if not unit: return length for units, factor in UNITS_CONVERSION_FACTORS: if unit in units: return length * factor raise ValueError(f"Unknown distance unit: {unit}!") class GeoIndex: """Indexer that allows fast range queries with geographical coordinates""" def __init__(self, lat, lon, metric=None, tree_class=None, shuffle=True, **tree_kwargs): """Initialize a GeoIndex Args: lat: Latitudes between -90 and 90 degrees as 1-dimensional numpy array. lon: Longitudes between -180 and 180 degrees as 1-dimensional numpy array. Must have the same length as `lat`. metric: We can use different metrics for the indexing. The default *minkowski* metric is the fastest but produces a big error for large distances. The *haversine* is more correct but is slower to compute. Rule of thumb: When searching for radii over 1000km, the error is ca. 1.4 kilometers. tree_class: Say what tree do you want to use for building the spatial index. Either *Ball* or *KD* is allowed. Default is *Ball*. shuffle: The trees have a terrible performance for sorted data as discussed in this issue: https://github.com/scikit-learn/scikit-learn/issues/7687. For sorted, almost-gridded data (such as from SEVIRI) you should set this to *True*. Default is True. Examples: We have two tracks (e.g. of satellites) and we want to know whether and where they have been in a close distance. .. code-block:: python import matplotlib.pyplot as plt import numpy as np from typhon.geographical import GeoIndex from typhon.plots import worldmap track1 = { "lat": 90*np.sin(np.linspace(-2*np.pi, 2*np.pi, 90)), "lon": np.linspace(-180, 180, 90), } track2 = { "lat": 90*np.cos(np.linspace(-2*np.pi, 2*np.pi, 90)), "lon": np.linspace(-180, 180, 90), } # Build the index with the data of the first track: index = GeoIndex(track1["lat"], track1["lon"]) # Find all points from the second track that are within a # radius of 500 miles to the first track pairs, distances = index.query( track2["lat"], track2["lon"], r="500 miles" ) # Plot the points worldmap( track1["lat"], track1["lon"], s=10, bg=True, label="Track 1") worldmap( track2["lat"], track2["lon"], s=10, label="Track 2") worldmap( track1["lat"][pairs[0]], track1["lon"][pairs[0]], s=10, label="Track 1 (matched)" ) worldmap( track2["lat"][pairs[1]], track2["lon"][pairs[1]], s=10, label="Track 2 (matched)" ) plt.legend() """ if metric is None: self.metric = "minkowski" else: self.metric = metric # Convert the lat and lon to the chosen metric: points = self._to_metric(lat, lon) # Save the latitudes and longitudes: self.lat = lat self.lon = lon if tree_class is None or tree_class == "Ball": tree_class = BallTree elif tree_class == "KD": tree_class = KDTree # KD- or ball trees have a very poor building performance for sorted # data (such as from SEVIRI) as discussed in this issue: # https://github.com/scikit-learn/scikit-learn/issues/7687 # Hence, we shuffle the data points before inserting them. if shuffle: self.shuffler = np.arange(points.shape[0]) np.random.shuffle(self.shuffler) points = points[self.shuffler] else: # The user does not want to shuffle self.shuffler = None self.tree = tree_class( points, **{**tree_kwargs, "metric": self.metric} ) def __getitem__(self, points): """Get nearest indices for given points. Warnings: Not yet implemented! Args: points: Returns: """ pass def _to_metric(self, lat, lon): if not (isinstance(lat, np.ndarray) or isinstance(lon, np.ndarray)): raise ValueError("lat and lon must be numpy.ndarray objects (no " "pandas.Series or xarray.DataArray)!") if self.metric == "minkowski": return np.column_stack( geocentric2cart(earth_radius, lat, lon) ) elif self.metric == "haversine": return np.radians( np.column_stack([lat, lon]) ) else: raise ValueError(f"Unknown metric '{self.metric}!'") def query(self, lat, lon, r, return_distance=True): """Find all neighbours within a radius of query points Args: lat: Latitudes between -90 and 90 degrees as 1-dimensional numpy array, list or single number. lon: Longitudes between -180 and 180 degrees as 1-dimensional numpy array, list or single number. Must have the same length as `lat`. r: Radius in kilometers (if number is given). You can also use another unit if you pass this as string (e.g. *'10 miles'*). Despite of the unit which is given here, the returned distances will always be in kilometers. return_distance: If True, the distances will be returned. Otherwise not and the query will be faster. Default is true. Returns: Two numpy arrays: *pairs* and *distances*. The first has a *2xN* shape, where N is the number of found collocations. The first row contains the indices of the points with which the GeoIndex was built. The second row contains the matches in the query points. *distances* is a numpy array with distances in kilometers. """ points = self._to_metric(lat, lon) # The user passes the radius in kilometers but we calculate with meters # internally r = to_kilometers(r) if self.metric == "minkowski": r *= 1000. elif self.metric == "haversine": r *= 1000. / earth_radius results = self.tree.query_radius( points, r, return_distance=return_distance ) if return_distance: jagged_pairs, jagged_distances = results else: jagged_pairs = results # Build the list of the collocation pairs: pairs = np.array([ [build_point, query_point] for query_point, build_points in enumerate(jagged_pairs) for build_point in build_points ]).T if not return_distance: return pairs if not pairs.any(): return pairs, pairs distances = np.hstack([ distances_to_query for distances_to_query in jagged_distances ]) # Return the distances in kilometers distances /= 1000. if self.shuffler is None: return pairs, distances else: # We shuffled the build points in the beginning, so the current # indices in the second row (the collocation indices from the build # points) are not correct pairs[0, :] = self.shuffler[pairs[0, :]] return pairs, distances def gridded_mean(lat, lon, data, grid): """Grid data along latitudes and longitudes Args: lat: Grid points along latitudes as 1xN dimensional array. lon: Grid points along longitudes as 1xM dimensional array. data: The data as NxM numpy array. grid: A tuple with two numpy arrays, consisting of latitude and longitude grid points. Returns: Two matrices in grid form: the mean and the number of points of `data`. """ grid_sum, _, _ = np.histogram2d(lat, lon, grid, weights=data) grid_number, _, _ = np.histogram2d(lat, lon, grid) return grid_sum / grid_number, grid_number def sea_mask(lat, lon, mask): """Check whether geographical coordinates are over sea Notes: This uses per default a land sea mask with a grid size of 5 minutes. You have to decide by yourself whether it is sufficient for your data. Args: lat: Latitudes between -90 and 90 degrees as 1-dimensional numpy array. lon: Longitudes between -180 and 180 degrees as 1-dimensional numpy array. Must have the same length as `lat`. mask: Your own land-sea mask as a 2-dimensional boolean matrix or a path to a monochromatic PNG file. Must be sea pixels must be True or white, respectively. Returns: Returns a boolean array with the same dimensions as `lat`. It is True where the coordinate is over the sea. Examples: .. code-block:: python lat = np.array([60]) lon = np.array([-0]) is_over_sea(lat, lon, "land_water_mask_5min.png") """ def _to_array(item): if isinstance(item, np.ndarray): return item if isinstance(item, Number): return np.array([item]) else: return np.array(item) lat = _to_array(lat) lon = _to_array(lon) if not lat.size or not lon.size: return np.array([]) if lon.min() < -180 or lon.max() > 180: raise ValueError("Longitudes out of bounds!") if lat.min() < -90 or lat.max() > 90: raise ValueError("Latitudes out of bounds!") if isinstance(mask, str): mask = np.flip(np.array(imageio.imread(mask) == 255), axis=0) mask_lat_step = 180 / (mask.shape[0] - 1) mask_lon_step = 360 / (mask.shape[1] - 1) lat_cell = (90 - lat) / mask_lat_step lon_cell = lon / mask_lon_step return mask[lat_cell.astype(int), lon_cell.astype(int)]