我想提出一个使用numba的解决方案,应该很容易理解。我假设您要“屏蔽”连续的重复项:
import numpy as np
import numba as nb
@nb.njit
def mask_more_n(arr, n):
mask = np.ones(arr.shape, np.bool_)
current = arr[0]
count = 0
for idx, item in enumerate(arr):
if item == current:
count += 1
else:
current = item
count = 1
mask[idx] = count <= n
return mask
例如:
>>> bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
>>> bins[mask_more_n(bins, 3)]
array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
>>> bins[mask_more_n(bins, 2)]
array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])
性能:
使用simple_benchmark
-但是我还没有包括所有方法。这是对数-对数比例:
似乎numba解决方案无法胜过Paul Panzer的解决方案,后者对于大型阵列而言似乎要快一点(并且不需要其他依赖项)。
但是,两者似乎都胜过其他解决方案,但是它们确实返回掩码而不是“过滤”数组。
import numpy as np
import numba as nb
from simple_benchmark import BenchmarkBuilder, MultiArgument
b = BenchmarkBuilder()
bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
@nb.njit
def mask_more_n(arr, n):
mask = np.ones(arr.shape, np.bool_)
current = arr[0]
count = 0
for idx, item in enumerate(arr):
if item == current:
count += 1
else:
current = item
count = 1
mask[idx] = count <= n
return mask
@b.add_function(warmups=True)
def MSeifert(arr, n):
return mask_more_n(arr, n)
from scipy.ndimage.morphology import binary_dilation
@b.add_function()
def Divakar_1(a, N):
k = np.ones(N,dtype=bool)
m = np.r_[True,a[:-1]!=a[1:]]
return a[binary_dilation(m,k,origin=-(N//2))]
@b.add_function()
def Divakar_2(a, N):
k = np.ones(N,dtype=bool)
return a[binary_dilation(np.ediff1d(a,to_begin=a[0])!=0,k,origin=-(N//2))]
@b.add_function()
def Divakar_3(a, N):
m = np.r_[True,a[:-1]!=a[1:],True]
idx = np.flatnonzero(m)
c = np.diff(idx)
return np.repeat(a[idx[:-1]],np.minimum(c,N))
from skimage.util import view_as_windows
@b.add_function()
def Divakar_4(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
idx = np.flatnonzero(m)
v = idx<len(w)
w[idx[v]] = 1
if v.all()==0:
m[idx[v.argmin()]:] = 1
return a[m]
@b.add_function()
def Divakar_5(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
last_idx = len(a)-m[::-1].argmax()-1
w[m[:-N+1]] = 1
m[last_idx:last_idx+N] = 1
return a[m]
@b.add_function()
def PaulPanzer(a,N):
mask = np.empty(a.size,bool)
mask[:N] = True
np.not_equal(a[N:],a[:-N],out=mask[N:])
return mask
import random
@b.add_arguments('array size')
def argument_provider():
for exp in range(2, 20):
size = 2**exp
yield size, MultiArgument([np.array([random.randint(0, 5) for _ in range(size)]), 3])
r = b.run()
import matplotlib.pyplot as plt
plt.figure(figsize=[10, 8])
r.plot()