# -*- coding: utf-8 -*-
"""General functions for manipulating geographical data.
"""
from numbers import Number
import imageio
import numpy as np
from sklearn.neighbors import BallTree, KDTree
from typhon.constants import earth_radius
from typhon.geodesy import geocentric2cart
from typhon.utils import split_units
__all__ = [
'area_weighted_mean',
'GeoIndex',
'gridded_mean',
'sea_mask'
]
[docs]def area_weighted_mean(lon, lat, data):
"""Calculate the mean of gridded data on a sphere.
Data points on the Earth's surface are often represented as a grid. As the
grid cells do not have a constant area they have to be weighted when
calculating statistical properties (e.g. mean).
This function returns the weighted mean assuming a perfectly spherical
globe.
Parameters:
lon (ndarray): Longitude (M) angles [degree].
lat (ndarray): Latitude (N) angles [degree].
data ()ndarray): Data array (N x M).
Returns:
float: Area weighted mean.
"""
# Calculate coordinates and steradian (in rad).
lon = np.deg2rad(lon)
lat = np.deg2rad(lat)
dlon = np.diff(lon)
dlat = np.diff(lat)
# Longitudal mean
middle_points = (data[:, 1:] + data[:, :-1]) / 2
norm = np.sum(dlon)
lon_integral = np.sum(middle_points * dlon, axis=1) / norm
# Latitudal mean
lon_integral *= np.cos(lat) # Consider varying grid area (N-S).
middle_points = (lon_integral[1:] + lon_integral[:-1]) / 2
norm = np.sum(np.cos((lat[1:] + lat[:-1]) / 2) * dlat)
return np.sum(middle_points * dlat) / norm
# Factor to convert a length unit to kilometers
UNITS_CONVERSION_FACTORS = [
[{"cm", "centimeter", "centimeters"}, 1e-6],
[{"m", "meter", "meters"}, 1e-3],
[{"km", "kilometer", "kilometers"}, 1],
[{"mi", "mile", "miles"}, 1.609344], # english statute mile
[{"yd", "yds", "yard", "yards"}, 0.9144e-3],
[{"ft", "foot", "feet"}, 0.3048e-3],
]
def to_kilometers(distance):
"""Convert different length units to kilometers
Args:
distance: A string or number.
Returns:
A distance as float in kilometers
"""
if isinstance(distance, Number):
return distance
elif not isinstance(distance, str):
raise ValueError("Distance must be a number or a string!")
length, unit = split_units(distance)
if length == 0:
raise ValueError("A valid distance length must be given!")
if not unit:
return length
for units, factor in UNITS_CONVERSION_FACTORS:
if unit in units:
return length * factor
raise ValueError(f"Unknown distance unit: {unit}!")
class GeoIndex:
"""Indexer that allows fast range queries with geographical coordinates"""
def __init__(self, lat, lon, metric=None, tree_class=None,
shuffle=True, **tree_kwargs):
"""Initialize a GeoIndex
Args:
lat: Latitudes between -90 and 90 degrees as 1-dimensional numpy
array.
lon: Longitudes between -180 and 180 degrees as 1-dimensional numpy
array. Must have the same length as `lat`.
metric: We can use different metrics for the indexing. The default
*minkowski* metric is the fastest but produces a big error for
large distances. The *haversine* is more correct but is slower
to compute. Rule of thumb: When searching for radii over
1000km, the error is ca. 1.4 kilometers.
tree_class: Say what tree do you want to use for building the
spatial index. Either *Ball* or *KD* is allowed. Default is
*Ball*.
shuffle: The trees have a terrible performance for sorted data
as discussed in this issue:
https://github.com/scikit-learn/scikit-learn/issues/7687. For
sorted, almost-gridded data (such as from SEVIRI) you should
set this to *True*. Default is True.
Examples:
We have two tracks (e.g. of satellites) and we want to know whether
and where they have been in a close distance.
.. code-block:: python
import matplotlib.pyplot as plt
import numpy as np
from typhon.geographical import GeoIndex
from typhon.plots import worldmap
track1 = {
"lat": 90*np.sin(np.linspace(-2*np.pi, 2*np.pi, 90)),
"lon": np.linspace(-180, 180, 90),
}
track2 = {
"lat": 90*np.cos(np.linspace(-2*np.pi, 2*np.pi, 90)),
"lon": np.linspace(-180, 180, 90),
}
# Build the index with the data of the first track:
index = GeoIndex(track1["lat"], track1["lon"])
# Find all points from the second track that are within a
# radius of 500 miles to the first track
pairs, distances = index.query(
track2["lat"], track2["lon"], r="500 miles"
)
# Plot the points
worldmap(
track1["lat"], track1["lon"],
s=10, bg=True, label="Track 1")
worldmap(
track2["lat"], track2["lon"], s=10, label="Track 2")
worldmap(
track1["lat"][pairs[0]], track1["lon"][pairs[0]],
s=10, label="Track 1 (matched)"
)
worldmap(
track2["lat"][pairs[1]], track2["lon"][pairs[1]],
s=10, label="Track 2 (matched)"
)
plt.legend()
"""
if metric is None:
self.metric = "minkowski"
else:
self.metric = metric
# Convert the lat and lon to the chosen metric:
points = self._to_metric(lat, lon)
# Save the latitudes and longitudes:
self.lat = lat
self.lon = lon
if tree_class is None or tree_class == "Ball":
tree_class = BallTree
elif tree_class == "KD":
tree_class = KDTree
# KD- or ball trees have a very poor building performance for sorted
# data (such as from SEVIRI) as discussed in this issue:
# https://github.com/scikit-learn/scikit-learn/issues/7687
# Hence, we shuffle the data points before inserting them.
if shuffle:
self.shuffler = np.arange(points.shape[0])
np.random.shuffle(self.shuffler)
points = points[self.shuffler]
else:
# The user does not want to shuffle
self.shuffler = None
self.tree = tree_class(
points, **{**tree_kwargs, "metric": self.metric}
)
def __getitem__(self, points):
"""Get nearest indices for given points.
Warnings:
Not yet implemented!
Args:
points:
Returns:
"""
pass
def _to_metric(self, lat, lon):
if not (isinstance(lat, np.ndarray) or isinstance(lon, np.ndarray)):
raise ValueError("lat and lon must be numpy.ndarray objects (no "
"pandas.Series or xarray.DataArray)!")
if self.metric == "minkowski":
return np.column_stack(
geocentric2cart(earth_radius, lat, lon)
)
elif self.metric == "haversine":
return np.radians(
np.column_stack([lat, lon])
)
else:
raise ValueError(f"Unknown metric '{self.metric}!'")
def query(self, lat, lon, r, return_distance=True):
"""Find all neighbours within a radius of query points
Args:
lat: Latitudes between -90 and 90 degrees as 1-dimensional numpy
array, list or single number.
lon: Longitudes between -180 and 180 degrees as 1-dimensional numpy
array, list or single number. Must have the same length as
`lat`.
r: Radius in kilometers (if number is given). You can also use
another unit if you pass this as string (e.g. *'10 miles'*).
Despite of the unit which is given here, the returned distances
will always be in kilometers.
return_distance: If True, the distances will be returned. Otherwise
not and the query will be faster. Default is true.
Returns:
Two numpy arrays: *pairs* and *distances*. The first has a
*2xN* shape, where N is the number of found collocations. The first
row contains the indices of the points with which the GeoIndex was
built. The second row contains the matches in the query points.
*distances* is a numpy array with distances in kilometers.
"""
points = self._to_metric(lat, lon)
# The user passes the radius in kilometers but we calculate with meters
# internally
r = to_kilometers(r)
if self.metric == "minkowski":
r *= 1000.
elif self.metric == "haversine":
r *= 1000. / earth_radius
results = self.tree.query_radius(
points, r, return_distance=return_distance
)
if return_distance:
jagged_pairs, jagged_distances = results
else:
jagged_pairs = results
# Build the list of the collocation pairs:
pairs = np.array([
[build_point, query_point]
for query_point, build_points in enumerate(jagged_pairs)
for build_point in build_points
]).T
if not return_distance:
return pairs
if not pairs.any():
return pairs, pairs
distances = np.hstack([
distances_to_query
for distances_to_query in jagged_distances
])
# Return the distances in kilometers
distances /= 1000.
if self.shuffler is None:
return pairs, distances
else:
# We shuffled the build points in the beginning, so the current
# indices in the second row (the collocation indices from the build
# points) are not correct
pairs[0, :] = self.shuffler[pairs[0, :]]
return pairs, distances
def gridded_mean(lat, lon, data, grid):
"""Grid data along latitudes and longitudes
Args:
lat: Grid points along latitudes as 1xN dimensional array.
lon: Grid points along longitudes as 1xM dimensional array.
data: The data as NxM numpy array.
grid: A tuple with two numpy arrays, consisting of latitude and
longitude grid points.
Returns:
Two matrices in grid form: the mean and the number of points of `data`.
"""
grid_sum, _, _ = np.histogram2d(lat, lon, grid, weights=data)
grid_number, _, _ = np.histogram2d(lat, lon, grid)
return grid_sum / grid_number, grid_number
def sea_mask(lat, lon, mask):
"""Check whether geographical coordinates are over sea
Notes:
This uses per default a land sea mask with a grid size of 5 minutes.
You have to decide by yourself whether it is sufficient for your data.
Args:
lat: Latitudes between -90 and 90 degrees as 1-dimensional numpy
array.
lon: Longitudes between -180 and 180 degrees as 1-dimensional numpy
array. Must have the same length as `lat`.
mask: Your own land-sea mask as a 2-dimensional boolean matrix or
a path to a monochromatic PNG file. Must be sea pixels must be True
or white, respectively.
Returns:
Returns a boolean array with the same dimensions as `lat`. It is True
where the coordinate is over the sea.
Examples:
.. code-block:: python
lat = np.array([60])
lon = np.array([-0])
is_over_sea(lat, lon, "land_water_mask_5min.png")
"""
def _to_array(item):
if isinstance(item, np.ndarray):
return item
if isinstance(item, Number):
return np.array([item])
else:
return np.array(item)
lat = _to_array(lat)
lon = _to_array(lon)
if not lat.size or not lon.size:
return np.array([])
if lon.min() < -180 or lon.max() > 180:
raise ValueError("Longitudes out of bounds!")
if lat.min() < -90 or lat.max() > 90:
raise ValueError("Latitudes out of bounds!")
if isinstance(mask, str):
mask = np.flip(np.array(imageio.imread(mask) == 255), axis=0)
mask_lat_step = 180 / (mask.shape[0] - 1)
mask_lon_step = 360 / (mask.shape[1] - 1)
lat_cell = (90 - lat) / mask_lat_step
lon_cell = lon / mask_lon_step
return mask[lat_cell.astype(int), lon_cell.astype(int)]