Question

How to compare rows within the same csv file faster

I have a csv file containing 720,000 rows and 10 columns. The columns relevant to the problem are ['timestamp_utc', 'looted_by__name', 'item_id', 'quantity']. The file is a log of items people loot off the ground in a game. The problem is that sometimes the loot logger bugs out and records the same person looting the same item twice in two different rows (those two rows could be separated by up to 5 rows) with a slight difference in the 'timestamp_utc' column; otherwise ['looted_by__name', 'item_id', 'quantity'] are the same. An example of this would be:

2024-06-23T11:40:43.2187312Z,Georgeeto,T4_SOUL,2

2024-06-23T11:40:43.4588316Z,Georgeeto,T4_SOUL,2

where in this example '2024-06-23T11:40:43.2187312Z' would be the 'timestamp_utc',

'Georgeeto' would be the 'looted_by__name'

'T4_SOUL' would be the 'item_id'

'2' would be the 'quantity'

What I am trying to do here is check whether ['looted_by__name', 'item_id', 'quantity'] are equal in both rows, and if they are, subtract the two rows' timestamps from one another. If the difference is less than 0.5 seconds, I copy both corrupted lines into a Corrupted.csv file and put only one of the lines in a Clean.csv file.

The way I went about doing this is the following:

import pandas as pd
import time
from datetime import datetime

start_time = time.time()
combined_df_3 = pd.read_csv("Proccesing/combined_file_refined.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
combined_df_4 = pd.read_csv("Proccesing/combined_file_refined.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
bugged_item_df = pd.DataFrame()
clean_item_df = pd.DataFrame()

bugged_item_list = []
clean_item_list = []

date_format = '%Y-%m-%dT%H:%M:%S.%f'

for index1,row1 in combined_df_3.iterrows():
    n = 0 
    time_stamp_1 = datetime.strptime(row1['timestamp_utc'][:26], date_format)
    name_1 = row1['looted_by__name']
    item_id_1 = row1['item_id']
    quantity_1 = row1['quantity']
    

    for index2, row2 in combined_df_4.iterrows():
        print(str(n))
        n += 1
        if n > 5:
            break

        time_stamp_2 = datetime.strptime(row2['timestamp_utc'][:26], date_format)
        name_2 = row2['looted_by__name']
        item_id_2 = row2['item_id']
        quantity_2 = row2['quantity']



        if time_stamp_1 == time_stamp_2 and name_1 == name_2 and item_id_1 == item_id_2 and quantity_1 == quantity_2:
            break # get out of for loop here

        elif name_1 == name_2 and item_id_1 == item_id_2 and quantity_1 == quantity_2:
            if time_stamp_1 > time_stamp_2:
                date_diff = abs(time_stamp_1 - time_stamp_2)
                date_diff_sec = date_diff.total_seconds()
            
            elif time_stamp_1 < time_stamp_2:
                date_diff = abs(time_stamp_2 - time_stamp_1)
                date_diff_sec = date_diff.total_seconds()

            if date_diff_sec < 0.5:
                bugged_item_df = bugged_item_df._append(row1 ,ignore_index=True)
                bugged_item_df = bugged_item_df._append(row2 ,ignore_index=True) #add both lines into  a csv file and not write 1 of them into the final csv file
            
            elif date_diff_sec > 0.5:
                pass # type line into a csv file normally
        else:
            pass # type line into a csv file normally

bugged_item_df.to_csv("test.csv", index=False)
clean_item_df.to_csv('test2.csv', index=False)

end_time = time.time()
execution_time = end_time - start_time

print(f"Execution time: {execution_time} seconds")

The way I am doing it technically works, but it takes about 6-13 hours to go through the entire file. I came to ask if there is a way to optimize it to run faster.

Note: the code is not finished yet, but you can get the idea from it.

Update: Thanks to the advice of AKZ (I love you, man) I was able to reduce the time from 13.4 hours to 32 minutes. I also realised that the for loop in the code I posted was wrong, so I went with the following answer:

import time
import pandas as pd
from datetime import datetime

# organizing the rows
df = pd.read_csv("proccesing/combined_file_refined.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
df = df.groupby(['looted_by__name', 'timestamp_utc']).sum().reset_index()
df.to_csv("test.csv", index=False)


bugged_item_df = pd.DataFrame()
clean_item_df = pd.DataFrame()
df1 =pd.read_csv("test.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
date_format = '%Y-%m-%dT%H:%M:%S.%f'
n = 0
num_of_runs = 0

start_time = time.time()

for index1,row1 in df.iterrows():
    num_of_runs += 1
    n += 1
    try:
        row2 = df1.iloc[n]
    except IndexError:
        clean_item_df = clean_item_df._append(row1 ,ignore_index=True)
        break
    time_stamp_1 = datetime.strptime(row1['timestamp_utc'][:26], date_format)
    name_1 = row1['looted_by__name']
    item_id_1 = row1['item_id']
    quantity_1 = row1['quantity']
    
    time_stamp_2 = datetime.strptime(row2['timestamp_utc'][:26], date_format)
    name_2 = row2['looted_by__name']
    item_id_2 = row2['item_id']
    quantity_2 = row2['quantity']

    if name_1 != name_2 or item_id_1 != item_id_2 or quantity_1 != quantity_2:
        #add row 1 to df
        continue
    elif time_stamp_1 > time_stamp_2:
        date_diff_1 = abs(time_stamp_1 - time_stamp_2)
        date_diff_sec_1 = date_diff_1.total_seconds()
        if date_diff_sec_1 < 0.5:
            # do not add row 1 to df; add row 1 and row 2 to the bugged item list
            bugged_item_df = bugged_item_df._append(row1 ,ignore_index=True)
            bugged_item_df = bugged_item_df._append(row2 ,ignore_index=True)
            pass
        elif date_diff_sec_1 > 0.5:
            clean_item_df = clean_item_df._append(row1 ,ignore_index=True)
            #add row 1 to df
            continue

    elif time_stamp_1 < time_stamp_2:
        date_diff_2 = abs(time_stamp_2 - time_stamp_1)
        date_diff_sec_2 = date_diff_2.total_seconds()
        if date_diff_sec_2 < 0.5:
            bugged_item_df = bugged_item_df._append(row1 ,ignore_index=True)
            bugged_item_df = bugged_item_df._append(row2 ,ignore_index=True)
            # do not add row 1 to df; add row 1 and row 2 to the bugged item list
            pass
        elif date_diff_sec_2 > 0.5:
            clean_item_df = clean_item_df._append(row1 ,ignore_index=True)
            #add row 1 to df
            continue
    

bugged_item_df.to_csv("bugged.csv", index=False)
clean_item_df.to_csv("clean.csv", index=False)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

If someone has a better answer than the one I did, please post it; I will greatly appreciate it.

Update 2: I edited the code again and realised I could just remove the bugged lines instead, which is faster; now it does it in 60 seconds.

import time
import pandas as pd
from datetime import datetime

# organizing the rows
combined_df_3 = pd.read_csv("proccesing/combined_file_refined.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
combined_df_3 = combined_df_3.groupby(['looted_by__name', 'timestamp_utc']).sum().reset_index()
combined_df_3.to_csv("proccesing/combined_file_orgnized.csv", index=False)



bugged_item_df = pd.DataFrame()
bugged_item_2df = pd.DataFrame()
combined_df_4 =pd.read_csv("proccesing/combined_file_orgnized.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
date_format = '%Y-%m-%dT%H:%M:%S.%f'
num_of_runs = 0


for index1,row1 in combined_df_3.iterrows():
    num_of_runs += 1
    try:
        row2 = combined_df_4.iloc[num_of_runs]
    except IndexError:
        break
    time_stamp_1 = datetime.strptime(row1['timestamp_utc'][:26], date_format)
    name_1 = row1['looted_by__name']
    item_id_1 = row1['item_id']
    quantity_1 = row1['quantity']
    
    time_stamp_2 = datetime.strptime(row2['timestamp_utc'][:26], date_format)
    name_2 = row2['looted_by__name']
    item_id_2 = row2['item_id']
    quantity_2 = row2['quantity']

    if name_1 != name_2 or item_id_1 != item_id_2 or quantity_1 != quantity_2:
        continue
    elif time_stamp_1 > time_stamp_2:
        date_diff_1 = abs(time_stamp_1 - time_stamp_2)
        date_diff_sec_1 = date_diff_1.total_seconds()
        if date_diff_sec_1 < 0.5:
            # do not add row 1 to df; add row 1 and row 2 to the bugged item list
            bugged_item_df = bugged_item_df._append(row1 ,ignore_index=True)
            bugged_item_df = bugged_item_df._append(row2 ,ignore_index=True)
            bugged_item_2df = bugged_item_2df._append(row1,ignore_index=True)
            

    elif time_stamp_1 < time_stamp_2:
        date_diff_2 = abs(time_stamp_2 - time_stamp_1)
        date_diff_sec_2 = date_diff_2.total_seconds()
        if date_diff_sec_2 < 0.5:
            bugged_item_df = bugged_item_df._append(row1 ,ignore_index=True)
            bugged_item_df = bugged_item_df._append(row2 ,ignore_index=True)
            bugged_item_2df = bugged_item_2df._append(row1,ignore_index=True)
            # do not add row 1 to df; add row 1 and row 2 to the bugged item list
            
    

bugged_item_df.to_csv("bugged.csv", index=False)
print('here')
clean_item_df = combined_df_3.merge(bugged_item_2df, on=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'], how='left', indicator=True).query('_merge == "left_only"').drop('_merge', axis=1)
clean_item_df.to_csv("clean.csv", index=False)

If someone knows how to improve it beyond 30 seconds, feel free to add another way.

Solution

Pandas was not designed to iterate rows: Don't iterate rows in Pandas. That answer really goes down the rabbit hole in terms of performance and alternatives, but I think a good takeaway for you would be that you need a better tool for the job.

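(If you do want to stay in Pandas, a fully vectorized version in the spirit of that answer might look roughly like the sketch below. The column names are yours, but the 0.5-second pairing logic is my own rough reconstruction of your rules, so treat it as an untested sketch rather than a drop-in replacement.)

import pandas as pd

cols = ['timestamp_utc', 'looted_by__name', 'item_id', 'quantity']
df = pd.read_csv("input.csv", usecols=cols)

# Parse the timestamps once, vectorized (pandas copes with the 7-digit fractions and trailing 'Z')
df['ts'] = pd.to_datetime(df['timestamp_utc'])

# Sort so near-duplicates of the same (name, item, quantity) sit next to each other
key = ['looted_by__name', 'item_id', 'quantity']
df = df.sort_values(key + ['ts'])

# Time gap to the previous row within each key group (NaT for the first row of a group)
gap = df.groupby(key)['ts'].diff()

# A row is the second half of a bugged pair if it follows its twin by less than 0.5s
is_dupe = gap < pd.Timedelta(milliseconds=500)

# A row is corrupt if it is such a second half, or is immediately followed by one
is_corrupt = is_dupe | is_dupe.shift(-1, fill_value=False)

df.loc[is_corrupt, cols].to_csv("Corrupted.csv", index=False)  # both halves of each pair
df.loc[~is_dupe, cols].to_csv("Clean.csv", index=False)        # keep only the first of each pair
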
Enter Python's csv module and its humble but very fast reader: nothing beats the reader in terms of row-reading performance (and it will probably remain that way, as the Python contributors have optimized it in C over the years).

reader = csv.reader(open("input.csv", newline=""))
clean_w = csv.writer(open("output-clean.csv", "w", newline=""))
corrupt_w = csv.writer(open("output-corrupt.csv", "w", newline=""))

Granted, a row is just a list of strings, but this actually works to your advantage because for this problem you only need to parse one field, the timestamp; the other three fields work fine just as strings because you use them for their identity, not their value. "2" or 2? It doesn't matter: this problem doesn't require doing math with the quantity, you only care about the quantity "two". I bring up this idea of identity vs. value because your Pandas code spends some time parsing "2" into 2, when "2" works just fine.

I expanded on your sample rows:

2024-06-23T11:40:43.2187312Z,Georgeeto,T4_SOUL,2
2024-06-23T11:40:43.3962342Z,Alicechen,XXYY,3
2024-06-23T11:40:43.4588316Z,Georgeeto,T4_SOUL,2
2024-06-23T11:40:44.5634358Z,Bobbiejjj,AABB,1

With that, I get a clean CSV like:

2024-06-23T11:40:43.2187312Z,Georgeeto,T4_SOUL,2
2024-06-23T11:40:43.3962342Z,Alicechen,XXYY,3
2024-06-23T11:40:44.5634358Z,Bobbiejjj,AABB,1

I ran your final (to date) Pandas code against that input and got a clean CSV like:

Alicechen,2024-06-23T11:40:43.3962342Z,XXYY,3
Bobbiejjj,2024-06-23T11:40:44.5634358Z,AABB,1
Georgeeto,2024-06-23T11:40:43.4588316Z,T4_SOUL,2

Similar. In yours, Georgeeto's row is out of chronological order.

My corrupt CSV looks like yours:

2024-06-23T11:40:43.2187312Z,Georgeeto,T4_SOUL,2
2024-06-23T11:40:43.4588316Z,Georgeeto,T4_SOUL,2

My program uses a last_seen dict to keep track of a row and its timestamp, keyed to a tuple of username, item, count (or, in your own terms, looted_by__name, item_id, quantity). I made a lightweight dataclass to hold the timestamp and the complete row, and created a type alias for the key:

@dataclass
class Entry:
    ts: datetime
    row: list[str]


Key = tuple[str, str, str]  # combo of username,item,quantity

then, the dict looks like:

last_seen: dict[Key, Entry] = {}

As the reader loops through the rows, it records every row by its key. If the current row's key already exists in the dict, there is a possible duplication, which is settled by subtracting the two timestamps. If the two rows do represent a duplicate (corrupt) entry, the current row gets marked as unclean, and a final check of the clean flag determines whether to write to the clean or the corrupt CSV.

This allows the program to only have to loop over the input once:

  • clean rows are written for every row with a new key, or if the key hasn't been seen in the last 500ms
  • corrupt rows are written for every pair of rows with the same key, within 500ms of each other

max_delta = timedelta(milliseconds=500)

last_seen: dict[Key, Entry] = {}

for row in reader:
    this_ts = datetime.strptime(row[0], "%Y-%m-%dT%H:%M:%S.%f")
    name = row[1]
    item = row[2]
    count = row[3]

    key = (name, item, count)

    clean = True
    last = Entry(datetime(1, 1, 1), [])  # get around "possibly unbound" error

    if key in last_seen:
        last = last_seen[key]
        delta = this_ts - last.ts
        if delta < max_delta:
            clean = False

    if clean:
        clean_w.writerow(row)
    else:
        corrupt_w.writerow(last.row)
        corrupt_w.writerow(row)

    last_seen[key] = Entry(this_ts, row)

I created a 720K-row test file and ran both of our programs against it. Yours ran in 28s and used about 239MB of memory; mine ran in under 4s and used about 14MB of memory. If I swap datetime.strptime(row[0], "%Y-%m-%dT%H:%M:%S.%f") for datetime.fromisoformat(row[0]), that shaves another 2s off mine... down to under 2s (probably because it doesn't have to interpret the format string 720K times).

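If you want to see that parsing difference in isolation, a quick micro-benchmark along these lines shows it (illustrative only, numbers vary by machine; the sample timestamp is truncated to six fractional digits, like your [:26] slice, so both parsers accept it):

import timeit
from datetime import datetime

ts = "2024-06-23T11:40:43.218731"  # six fractional digits, as in the [:26] slice

strptime_s = timeit.timeit(
    lambda: datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S.%f"), number=720_000)
fromiso_s = timeit.timeit(
    lambda: datetime.fromisoformat(ts), number=720_000)

print(f"strptime:      {strptime_s:.2f}s for 720K parses")
print(f"fromisoformat: {fromiso_s:.2f}s for 720K parses")
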
My complete program:

import csv

from dataclasses import dataclass
from datetime import datetime, timedelta

reader = csv.reader(open("big.csv", newline=""))
clean_w = csv.writer(open("output-clean.csv", "w", newline=""))
corrupt_w = csv.writer(open("output-corrupt.csv", "w", newline=""))

# Copy header from reader to output writers
header = next(reader)
clean_w.writerow(header)
corrupt_w.writerow(header)

# Create small class, and a separate type; for cleaner code w/
# type safety.
@dataclass
class Entry:
    ts: datetime
    row: list[str]

Key = tuple[str, str, str]  # username,item,quantity

# Precompute delta (saves about .2s over 720K iterations)
max_delta = timedelta(milliseconds=500)

# Initialize dict
last_seen: dict[Key, Entry] = {}

# Iterate reader, parse row, check for previous key in
# last_seen and determine clean status, write accordingly,
# save current row to key in last_seen.
for row in reader:
    this_ts = datetime.fromisoformat(row[0])
    name = row[1]
    item = row[2]
    count = row[3]

    key = (name, item, count)

    clean = True
    last = Entry(datetime(1, 1, 1), [])  # get around "possibly unbound" error

    if key in last_seen:
        last = last_seen[key]
        delta = this_ts - last.ts
        if delta < max_delta:
            clean = False

    if clean:
        clean_w.writerow(row)
    else:
        corrupt_w.writerow(last.row)
        corrupt_w.writerow(row)

    last_seen[key] = Entry(this_ts, row)

2024-07-21
Zach Young