Now that we've explored several methods to convert a list to a set while preserving order, let's compare their performance and establish some best practices.
Create a new file named `performance_test.py`:
import time
from collections import OrderedDict
def method1_dict(data):
    """Deduplicate ``data`` via dict.fromkeys(), keeping first occurrences.

    The items become the keys of a plain dict (so they must be hashable),
    and the keys are then copied back out into a new list.
    """
    unique_keys = dict.fromkeys(data)
    return list(unique_keys)
def method2_ordereddict(data):
    """Deduplicate ``data`` via OrderedDict.fromkeys(), keeping first occurrences.

    The items become the keys of an OrderedDict (so they must be hashable),
    and the keys are then copied back out into a new list.
    """
    ordered_keys = OrderedDict.fromkeys(data)
    return list(ordered_keys)
def method3_loop(data):
    """Deduplicate ``data`` with an explicit loop and a set of seen items.

    Walks ``data`` once, appending each item to the output the first time
    it is encountered. Items must be hashable to be stored in the set.
    """
    already_seen = set()
    unique_items = []
    for element in data:
        if element in already_seen:
            continue  ## duplicate: skip it
        already_seen.add(element)
        unique_items.append(element)
    return unique_items
def time_function(func, data, runs=100):
    """Return the average time in seconds of one ``func(data)`` call.

    Args:
        func: Callable invoked as ``func(data)``; its return value is ignored.
        data: Argument passed unchanged to every call.
        runs: Number of calls to average over; must be at least 1.

    Returns:
        Total elapsed time divided by ``runs``, as a float.

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")
    ## perf_counter() is a monotonic, high-resolution clock intended for
    ## measuring short durations; time.time() can jump with system clock
    ## adjustments and may be too coarse for microsecond-scale calls.
    start_time = time.perf_counter()
    for _ in range(runs):
        func(data)
    elapsed = time.perf_counter() - start_time
    return elapsed / runs
## Test data: each list is a run of unique integers followed by a repeat
## of its first half, so one third of the items are duplicates.
small_list = [*range(100), *range(50)]  ## 150 items, 50 duplicates
medium_list = [*range(1000), *range(500)]  ## 1500 items, 500 duplicates
large_list = [*range(10000), *range(5000)]  ## 15000 items, 5000 duplicates

## Benchmark table: (heading printed before the group, input list)
datasets = (
    ("Small list (150 items, 50 duplicates):", small_list),
    ("\nMedium list (1,500 items, 500 duplicates):", medium_list),
    ("\nLarge list (15,000 items, 5,000 duplicates):", large_list),
)

## Methods under test: (label printed before the timing, function)
methods = (
    ("dict.fromkeys()", method1_dict),
    ("OrderedDict.fromkeys()", method2_ordereddict),
    ("Loop and set", method3_loop),
)

## Test results
print("Performance comparison (average time in seconds over 100 runs):\n")
for heading, dataset in datasets:
    print(heading)
    for label, method in methods:
        print(f"{label}: {time_function(method, dataset):.8f}")
Run the performance test:
python3 performance_test.py
The output will show the performance of each method with different list sizes:
Performance comparison (average time in seconds over 100 runs):
Small list (150 items, 50 duplicates):
dict.fromkeys(): 0.00000334
OrderedDict.fromkeys(): 0.00000453
Loop and set: 0.00000721
Medium list (1,500 items, 500 duplicates):
dict.fromkeys(): 0.00003142
OrderedDict.fromkeys(): 0.00004123
Loop and set: 0.00007621
Large list (15,000 items, 5,000 duplicates):
dict.fromkeys(): 0.00035210
OrderedDict.fromkeys(): 0.00044567
Loop and set: 0.00081245
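The actual numbers may vary depending on your system, but you should notice some patterns: in the sample output above, dict.fromkeys() is the fastest method at every size, OrderedDict.fromkeys() is slightly slower, and the loop-and-set approach takes roughly twice as long as dict.fromkeys(). For all three methods, the time grows roughly in proportion to the length of the list.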
Best Practices
Based on our experiments, let's establish some best practices. Create a file named `best_practices.py`:
"""
Best Practices for Converting a List to a Set While Preserving Order
"""
## Example 1: For Python 3.7+, use dict.fromkeys() for best performance
def preserve_order_modern(lst):
    """Return the items of ``lst`` without duplicates, in first-seen order.

    Recommended approach for Python 3.7+: the items are used as the keys
    of a plain dict and then read back out as a list.
    """
    deduplicated = dict.fromkeys(lst)
    return list(deduplicated)
## Example 2: For compatibility with older Python versions, use OrderedDict
from collections import OrderedDict
def preserve_order_compatible(lst):
    """Return the items of ``lst`` without duplicates, in first-seen order.

    Compatibility-oriented variant: the items are used as the keys of an
    OrderedDict and then read back out as a list.
    """
    deduplicated = OrderedDict.fromkeys(lst)
    return list(deduplicated)
## Example 3: When you need to process elements while preserving order
def preserve_order_with_processing(lst, key=None):
    """Remove duplicates from ``lst`` by a normalized form, keeping order.

    Two items count as duplicates when ``key`` maps them to the same
    value. The first item of each group is kept, unmodified, in its
    original position.

    Args:
        lst: Iterable of items to deduplicate.
        key: Optional one-argument callable returning the value used for
            the duplicate check; the value must be hashable. Defaults to
            ``str(item).lower()``, i.e. a case-insensitive comparison of
            the items' string forms.

    Returns:
        A new list holding the first item seen for each distinct key.
    """
    if key is None:
        ## Example processing: case-insensitive comparison of string forms
        normalize = lambda item: str(item).lower()
    else:
        normalize = key

    result = []
    seen = set()
    for item in lst:
        processed_item = normalize(item)
        if processed_item not in seen:
            seen.add(processed_item)
            result.append(item)  ## Keep original item in the result
    return result
## Demo: run every method on the same mixed-case sample and print the results
data = ["Apple", "banana", "Orange", "apple", "Pear", "BANANA"]
print("Original list:", data)

## (label, function) pairs, in the order they are printed
demos = (
    ("Method 1 (Python 3.7+):", preserve_order_modern),
    ("Method 2 (Compatible):", preserve_order_compatible),
    ("Method 3 (With processing):", preserve_order_with_processing),
)
for label, dedupe in demos:
    print(label, dedupe(data))
Run the file:
python3 best_practices.py
The output shows how each method handles the data:
Original list: ['Apple', 'banana', 'Orange', 'apple', 'Pear', 'BANANA']
Method 1 (Python 3.7+): ['Apple', 'banana', 'Orange', 'apple', 'Pear', 'BANANA']
Method 2 (Compatible): ['Apple', 'banana', 'Orange', 'apple', 'Pear', 'BANANA']
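Method 3 (With processing): ['Apple', 'banana', 'Orange', 'Pear']
Notice that Methods 1 and 2 are case-sensitive, so they keep every item, while Method 3 treats "Apple" and "apple" (and likewise "banana" and "BANANA") as the same item because it compares lowercased values. Only the first occurrence of each is kept, in its original form.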
Recommendations
Based on our experiments, here are some recommendations:
- For Python 3.7 and later, use `dict.fromkeys()` for the best performance.
- For compatibility with older Python versions (before 3.7, where plain dicts do not guarantee insertion order), use `OrderedDict.fromkeys()`.
- When you need to perform custom processing while checking for duplicates, use the loop and set approach.
- Consider case-sensitivity and other transformations based on your specific requirements.