Question

Joining lat/lon data frames by nearest distance

Let's say I have a regular latitude/longitude grid and data at irregular locations, like this:

grid = tidyr::crossing(lon = seq(0, 1, 0.25), lat = seq(0, 1, 0.25))
data = tibble::tibble(lon = runif(4), lat=runif(4), y=rnorm(4))

How do I use, for example, dplyr::inner_join and join_by to join these data frames so that I get y values from data and corresponding lat and lon values from grid from the nearest location, i.e. the grid point with smallest (grid$lon - data$lon)^2 + (grid$lat - data$lat)^2 for each row in data?

 3  72  3
1 Jan 1970

Solution

 2

The package sf is made to manipulate spatial geometries; ex. points, lines, polygones. You need to convert the dataframes as sf objects, then you can specify a spatial join st_join() with join = st_nearest_feature as argument.

library(sf)
library(tidyverse)

set.seed(42)

grid <- tidyr::crossing(lon = seq(0, 1, 0.25), lat = seq(0, 1, 0.25))
data <- tibble::tibble(lon = runif(4), lat = runif(4), y = rnorm(4))

grid_sf = st_as_sf(grid , coords =c("lon","lat"))
data_sf = st_as_sf(data , coords =c("lon","lat"))

joined = st_join(grid_sf, data_sf, join = st_nearest_feature)

ggplot() + geom_sf(data= joined, aes(col = y))+ 
  geom_sf(data= data_sf, aes(col = y, fill = y),size= 4, shape = 22)

Created on 2024-07-12 with reprex v2.1.0

2024-07-12
VinceGreg

Solution

 1

You can merge the 2 dataframes and search for lowest distance per coordinates group:

grid = tidyr::crossing(lon_grid = seq(0, 1, 0.25), lat_grid = seq(0, 1, 0.25))
data = tibble::tibble(lon = runif(4), lat=runif(4), y=rnorm(4))

library(dplyr)
tidyr::expand_grid(data, grid) %>%
  group_by(lon, lat) %>%
  filter(row_number() == which.min((lon_grid - lon) ^ 2 + (lat_grid - lat) ^ 2)) %>%
  ungroup()


# A tibble: 4 × 5
    lon    lat       y lon_grid lat_grid
  <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
1 0.612 0.208   0.0407     0.5      0.25
2 0.381 0.520   2.01       0.5      0.5 
3 0.198 0.399  -0.455      0.25     0.5 
4 0.150 0.0847 -0.717      0.25     0  

For large dataframes, you can filter on points that have coordinates at resolution to reduce group numbers:

data %>%
  rowwise() %>%
  mutate(grid = list(grid[(abs(grid$lon_grid - lon) < 0.25 & abs(grid$lat_grid - lat) < 0.25), ])) %>%
  tidyr::unnest(grid) %>%
  group_by(lon, lat) %>%
  filter(row_number() == which.min((lon_grid - lon) ^ 2 + (lat_grid - lat) ^ 2)) %>%
  ungroup()
2024-07-12
Clemsang

Solution

 1

If the grid is uniform you can simply round to the grid granularity.

set.seed(42)

grid <- tidyr::crossing(lon = seq(0, 1, 0.25), lat = seq(0, 1, 0.25))
data <- tibble::tibble(lon = runif(4), lat = runif(4), y = rnorm(4))

round_to <- function(x, precision) {
  precision * round(x / precision)
}

data |>
  dplyr::mutate(
    lon_grid = round_to(lon, 0.25),
    lat_grid = round_to(lat, 0.25),
  )
#> # A tibble: 4 × 5
#>     lon   lat       y lon_grid lat_grid
#>   <dbl> <dbl>   <dbl>    <dbl>    <dbl>
#> 1 0.915 0.642  0.404      1        0.75
#> 2 0.937 0.519 -0.106      1        0.5 
#> 3 0.286 0.737  1.51       0.25     0.75
#> 4 0.830 0.135 -0.0947     0.75     0.25

For joining arbitrary points, you’ll need to calculate all the pairwise distances. That can be done efficiently with Rfast::dista().

data |>
  dplyr::mutate(
    grid_index = dplyr::pick(lon, lat) |>
      Rfast::dista(grid[c("lon", "lat")]) |>
      Rfast::rowMins(),
    grid[grid_index, ] |> dplyr::rename_all(paste0, "_grid")
  )
#> # A tibble: 4 × 6
#>     lon   lat       y grid_index lon_grid lat_grid
#>   <dbl> <dbl>   <dbl>      <dbl>    <dbl>    <dbl>
#> 1 0.915 0.642  0.404          24     1        0.75
#> 2 0.937 0.519 -0.106          23     1        0.5 
#> 3 0.286 0.737  1.51            9     0.25     0.75
#> 4 0.830 0.135 -0.0947         17     0.75     0.25
2024-07-12
Mikko Marttila

Solution

 1

Here is a base R approach using max.col + outer

cp_grid <- with(grid, lon + 1i * lat)
cp_data <- with(data, lon + 1i * lat)
cbind(
    data,
    setNames(
        grid,
        paste0("grid_", names(grid))
    )[max.col(-abs(outer(cp_data, cp_grid, `-`))), ]
)

which gives result looking like

        lon       lat          y grid_lon grid_lat
1 0.8966972 0.9082078  0.4146414     1.00     1.00
2 0.2655087 0.2016819 -1.5399500     0.25     0.25
3 0.3721239 0.8983897 -0.9285670     0.25     1.00
4 0.5728534 0.9446753 -0.2947204     0.50     1.00

Data

set.seed(0)
grid <- tidyr::crossing(lon = seq(0, 1, 0.25), lat = seq(0, 1, 0.25))
data <- tibble::tibble(lon = runif(4), lat = runif(4), y = rnorm(4))
2024-07-12
ThomasIsCoding