The solution I came up with is to separately find the indices where the first occurrence of 1 is and the indices where the last occurrence of 1 is.
def get_indices2(arr):
value_diff = np.diff(arr, prepend=0, append=0)
start_idx = np.nonzero(value_diff == 1)[0]
end_idx = np.nonzero(value_diff == -1)[0] - 1 # -1 to include endpoint
idx = np.stack((start_idx, end_idx), axis=-1)
return idx
Note that the result is not a list of tuples, but a 2D numpy array like the one below.
array([[0, 0],
[3, 5],
[7, 9]])
Here is benchmark:
import timeit
import numpy as np
def get_indices(array):
indices = []
xstart = None
for x, col in enumerate(array):
if col == 0 and xstart is not None:
indices.append((xstart, x - 1))
xstart = None
elif col == 1 and xstart is None:
xstart = x
if xstart is not None:
indices.append((xstart, x))
return indices
def get_indices2(arr):
value_diff = np.diff(arr, prepend=0, append=0)
start_idx = np.nonzero(value_diff == 1)[0]
end_idx = np.nonzero(value_diff == -1)[0] - 1 # -1 to include endpoint
idx = np.stack((start_idx, end_idx), axis=-1)
return idx
def benchmark():
rng = np.random.default_rng(0)
arr = rng.integers(0, 1, endpoint=True, size=20_000_000)
expected = np.asarray(get_indices(arr))
for f in [get_indices, get_indices2]:
t = np.asarray(f(arr))
assert expected.shape == t.shape and np.array_equal(expected, t), f.__name__
elapsed = min(timeit.repeat(lambda: f(arr), repeat=10, number=1))
print(f"{f.__name__:25}: {elapsed}")
benchmark()
Result:
get_indices : 4.708652864210308
get_indices2 : 0.21052680909633636
One thing that concerns me is that on my PC, your function takes less than 5 seconds to process 20 million elements, while you mention that it takes 8 seconds to process 2 million elements. So I may be missing something.
Update
Matt provided an elegant solution using reshape in his answer. However, if performance is important, I would suggest optimizing the np.diff
part first.
def custom_int8_diff(arr):
out = np.empty(len(arr) + 1, dtype=np.int8)
out[0] = arr[0]
out[-1] = -arr[-1]
np.subtract(arr[1:], arr[:-1], out=out[1:-1])
return out
def get_indices2_custom_diff(arr):
mask = custom_int8_diff(arr) # Use custom diff. Others unchanged.
start_idx = np.nonzero(mask == 1)[0]
end_idx = np.nonzero(mask == -1)[0] - 1
return np.stack((start_idx, end_idx), axis=-1)
For Matt's reshape solution, we can use logical_xor
, which is even faster.
def custom_bool_diff(arr):
out = np.empty(len(arr) + 1, dtype=np.bool_)
out[0] = arr[0]
out[-1] = arr[-1]
np.logical_xor(arr[1:], arr[:-1], out=out[1:-1])
return out
def get_indices3_custom_diff(arr):
value_diff = custom_bool_diff(arr) # Use custom diff. Others unchanged.
idx = np.nonzero(value_diff)[0]
idx[1::2] -= 1
return idx.reshape(-1, 2)
Benchmark (2 million elements):
get_indices : 0.463582425378263
get_indices2 : 0.01675519533455372
get_indices3 : 0.01814895775169134
get_indices2_custom_diff : 0.010258681140840054
get_indices3_custom_diff : 0.006368924863636494
Benchmark (20 million elements):
get_indices : 4.708652864210308
get_indices2 : 0.21052680909633636
get_indices3 : 0.19463363010436296
get_indices2_custom_diff : 0.14093663357198238
get_indices3_custom_diff : 0.08207075204700232