GeoPandas：查找其他数据框中的最近点

20

我有2个地理数据框：

import geopandas as gpd
from shapely.geometry import Point
gpd1 = gpd.GeoDataFrame([['John',1,Point(1,1)],['Smith',1,Point(2,2)],['Soap',1,Point(0,2)]],columns=['Name','ID','geometry'])
gpd2 = gpd.GeoDataFrame([['Work',Point(0,1.1)],['Shops',Point(2.5,2)],['Home',Point(1,1.1)]],columns=['Place','geometry'])

我想为gpd1中的每一行找到gpd2中最近点的名称：

desired_output = 

    Name  ID     geometry  Nearest
0   John   1  POINT (1 1)     Home
1  Smith   1  POINT (2 2)    Shops
2   Soap   1  POINT (0 2)     Work

我一直在尝试使用lambda函数使其工作：

gpd1['Nearest'] = gpd1.apply(lambda row: min_dist(row.geometry,gpd2)['Place'] , axis=1)

与

def min_dist(point, gpd2):

    geoseries = some_function()
    return geoseries

— RedM
source

这种方法对我

— 有用

16

您可以直接使用Shapely函数的最近点（GeoSeries的几何是Shapely几何）：

from shapely.ops import nearest_points
# unary union of the gpd2 geomtries 
pts3 = gpd2.geometry.unary_union
def near(point, pts=pts3):
     # find the nearest point and return the corresponding Place value
     nearest = gpd2.geometry == nearest_points(point, pts)[1]
     return gpd2[nearest].Place.get_values()[0]
gpd1['Nearest'] = gpd1.apply(lambda row: near(row.geometry), axis=1)
gpd1
    Name  ID     geometry  Nearest
0   John   1  POINT (1 1)     Home
1  Smith   1  POINT (2 2)    Shops
2   Soap   1  POINT (0 2)     Work

解释

for i, row in gpd1.iterrows():
    print nearest_points(row.geometry, pts3)[0], nearest_points(row.geometry, pts3)[1]
 POINT (1 1) POINT (1 1.1)
 POINT (2 2) POINT (2.5 2)
 POINT (0 2) POINT (0 1.1)

— 基因
source

有些东西对我不起作用，我无法弄清楚。即使几何是实体，该函数也会返回一个空的GeoSeries。例如： sample_point = gpd2.geometry.unary_union[400] / sample_point in gpd2.geometry 这返回True。 gpd2.geometry == sample_point 这全都是假的。

— robroc

除了上述内容：gpd2.geometry.geom_equals(sample_point)作品。

— robroc

13

如果您有大型数据scipy框，我发现cKDTree空间索引.query方法会为最近的邻居搜索返回非常快速的结果。由于它使用空间索引，因此比遍历数据帧然后找到所有距离的最小值要快几个数量级。它也比使用nearest_pointsshapely与RTree（可通过geopandas获得的空间索引方法）更快，因为cKDTree允许您向量化搜索，而另一种方法则不能。

这是一个辅助函数，将从中gpd2的每个点返回距离最近的邻居的距离和“名称” gpd1。假定两个gdf都有一geometry列（点）。

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point

gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)], ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', Point(0, 1.1)], ['Shops', Point(2.5, 2)],
                         ['Home', Point(1, 1.1)]],
                        columns=['Place', 'geometry'])

def ckdnearest(gdA, gdB):
    nA = np.array(list(zip(gdA.geometry.x, gdA.geometry.y)) )
    nB = np.array(list(zip(gdB.geometry.x, gdB.geometry.y)) )
    btree = cKDTree(nB)
    dist, idx = btree.query(nA, k=1)
    gdf = pd.concat(
        [gdA, gdB.loc[idx, gdB.columns != 'geometry'].reset_index(),
         pd.Series(dist, name='dist')], axis=1)
    return gdf

ckdnearest(gpd1, gpd2)

如果您想找到最接近LineString的点，下面是一个完整的工作示例：

import itertools
from operator import itemgetter

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString

gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
                         ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', LineString([Point(100, 0), Point(100, 1)])],
                         ['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
                         ['Home',  LineString([Point(101, 0), Point(102, 1)])]],
                        columns=['Place', 'geometry'])


def ckdnearest(gdfA, gdfB, gdfB_cols=['Place']):
    A = np.concatenate(
        [np.array(geom.coords) for geom in gdfA.geometry.to_list()])
    B = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
    B_ix = tuple(itertools.chain.from_iterable(
        [itertools.repeat(i, x) for i, x in enumerate(list(map(len, B)))]))
    B = np.concatenate(B)
    ckd_tree = cKDTree(B)
    dist, idx = ckd_tree.query(A, k=1)
    idx = itemgetter(*idx)(B_ix)
    gdf = pd.concat(
        [gdfA, gdfB.loc[idx, gdfB_cols].reset_index(drop=True),
         pd.Series(dist, name='dist')], axis=1)
    return gdf

c = ckdnearest(gpd1, gpd2)

— Hu
source

是否可以使用此方法在直线上给出最近的点？例如，将GPS位置捕捉到最近的街道。

— hyperknot

这个答案太神奇了！但是，最接近直线的代码对我来说是一个错误。似乎对于每个点都返回了距最近直线的正确距离，但是返回的线ID错误。我认为它是idx的计算，但是我对Python还是很陌生，所以我无法设法解决这个问题。

— Shakedk

1

弄清楚了：

def min_dist(point, gpd2):
    gpd2['Dist'] = gpd2.apply(lambda row:  point.distance(row.geometry),axis=1)
    geoseries = gpd2.iloc[gpd2['Dist'].argmin()]
    return geoseries

当然，欢迎提出批评。我不喜欢为gpd1的每一行重新计算gpd2 ['Dist']。

— RedM
source

1

吉恩的答案对我不起作用。最终，我发现gpd2.geometry.unary_union生成的几何仅包含我的大约150.000点中的大约30.000。对于遇到相同问题的其他人，这是我解决的方法：

    from shapely.ops import nearest_points
    from shapely.geometry import MultiPoint

    gpd2_pts_list = gpd2.geometry.tolist()
    gpd2_pts = MultiPoint(gpd2_pts_list)
    def nearest(point, gpd2_pts, gpd2=gpd2, geom_col='geometry', src_col='Place'):
         # find the nearest point
         nearest_point = nearest_points(point, gpd2_pts)[1]
         # return the corresponding value of the src_col of the nearest point
         value = gpd2[gpd2[geom_col] == nearest_point][src_col].get_values()[0]
         return value

    gpd1['Nearest'] = gpd1.apply(lambda x: nearest(x.geometry, gpd2_pts), axis=1)

— 因斯克
source

0

对于使用@ JHuw的出色答案同时使用自己的数据进行索引编制错误的任何人，我的问题是我的索引未对齐。重置gdfA和gdfB的索引解决了我的问题，也许这对您也有帮助@ Shakedk。

import itertools
from operator import itemgetter

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString

gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
                         ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', LineString([Point(100, 0), Point(100, 1)])],
                         ['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
                         ['Home',  LineString([Point(101, 0), Point(102, 1)])]],
                        columns=['Place', 'geometry'])


def ckdnearest(gdfA, gdfB, gdfB_cols=['Place']):
    # resetting the index of gdfA and gdfB here.
    gdfA = gdfA.reset_index(drop=True)
    gdfB = gdfB.reset_index(drop=True)
    A = np.concatenate(
        [np.array(geom.coords) for geom in gdfA.geometry.to_list()])
    B = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
    B_ix = tuple(itertools.chain.from_iterable(
        [itertools.repeat(i, x) for i, x in enumerate(list(map(len, B)))]))
    B = np.concatenate(B)
    ckd_tree = cKDTree(B)
    dist, idx = ckd_tree.query(A, k=1)
    idx = itemgetter(*idx)(B_ix)
    gdf = pd.concat(
        [gdfA, gdfB.loc[idx, gdfB_cols].reset_index(drop=True),
         pd.Series(dist, name='dist')], axis=1)
    return gdf

c = ckdnearest(gpd1, gpd2)

— 马库斯·罗森菲尔德（Markus Rosenfelder）
source