Note: This is the fixed version of my previous post
http://www.mycpu.org/set-or-unique/. If you are interested in what needed fixing
I recommend you go there first.
Problem:
To find unique elements in a vector quickly with existing std::
routines.
Setup Code:
CMake
So I setup a quick Google Benchmark harness:
cmake_minimum_required(VERSION 3.16)
# Compiler and Standard
set(CMAKE_C_STANDARD 99)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_C_COMPILER clang-8)
set(CMAKE_CXX_COMPILER clang++-8)
# set the project name
set(this_project project_name)
# RUST, Erlang, what else
project(${this_project} C CXX)
add_subdirectory(src)
find_package(benchmark REQUIRED)
target_link_libraries(${this_project} benchmark::benchmark)
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_ENABLE_EXPORTS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
Measure the Truth
Get a few (1000) random numbers
std::vector<int> nums_;
auto helper_(int n) -> std::vector<int> {
auto gen_node_ = [=]() -> int { return rand() % 100; };
while (n--) { nums_.emplace_back(gen_node_()); }
return nums_;
}
Clean up after yourself
nums_.clear();
Candidate Solutions:
std::set
static void BM_Set(benchmark::State& state) {
for (auto _ : state) {
auto n = 1000;
while (n--) { nums_.emplace_back(gen_node_()); }
auto start = std::chrono::high_resolution_clock::now();
std::set<int> s;
for (const auto& e_ : nums_) { s.insert(e_); }
auto end = std::chrono::high_resolution_clock::now();
auto elapsed_seconds =
std::chrono::duration_cast<std::chrono::duration<double>>(
end - start);
state.SetIterationTime(elapsed_seconds.count());
nums_.clear();
}
}
BENCHMARK(BM_Set);
std::unordered_set
static void BM_UnorderedSet(benchmark::State& state) {
for (auto _ : state) {
auto n = 1000;
while (n--) { nums_.emplace_back(gen_node_()); }
auto start = std::chrono::high_resolution_clock::now();
std::unordered_set<int> s;
for (const auto& e_ : nums_) { s.insert(e_); }
auto end = std::chrono::high_resolution_clock::now();
auto elapsed_seconds =
std::chrono::duration_cast<std::chrono::duration<double>>(
end - start);
state.SetIterationTime(elapsed_seconds.count());
nums_.clear();
}
}
BENCHMARK(BM_UnorderedSet);
std::sort
and std::unique
Apparently, std::unique
simply swaps duplicate numbers to the end of the
array as it processes elements. One Possible Impl of std::unique()
is:
template <class ForwardIterator>
ForwardIterator unique (ForwardIterator first, ForwardIterator last)
{
if (first==last) return last;
ForwardIterator result = first;
while (++first != last)
{
if (!(*result == *first)) // or: if (!pred(*result,*first)) for version (2)
*(++result)=*first;
}
return ++result;
}
From this snippet, it’s clear that for us to use std::unique()
we will need
the input to already be sorted. This is additional work but whatever! So let’s
sort it, and since this is a requirement of the input for this algo to work we
will need to account the time taken to sort the input array as well.
static void BM_UniqueSort(benchmark::State& state) {
for (auto _ : state) {
auto n = 1000;
while (n--) { nums_sorted.emplace_back(gen_node_()); }
auto start = std::chrono::high_resolution_clock::now();
std::sort(nums_sorted.begin(), nums_sorted.end());
auto uniq_iter = std::unique(nums_sorted.begin(), nums_sorted.end());
nums_sorted.erase(uniq_iter, nums_sorted.end());
auto end = std::chrono::high_resolution_clock::now();
auto elapsed_seconds =
std::chrono::duration_cast<std::chrono::duration<double>>(
end - start);
state.SetIterationTime(elapsed_seconds.count());
nums_sorted.clear();
}
}
BENCHMARK(BM_UniqueSort);
What If Inputs Are Already Sorted?
std::set
on Sorted Inputs
static void BM_SetSortedInput(benchmark::State& state) {
clear_up_();
for (auto _ : state) {
auto n = 1000;
while (n--) { nums_.emplace_back(gen_node_()); }
std::sort(nums_.begin(), nums_.end());
auto start = std::chrono::high_resolution_clock::now();
std::set<int> s;
for (const auto& e_ : nums_) { s.insert(e_); }
auto end = std::chrono::high_resolution_clock::now();
auto elapsed_seconds =
std::chrono::duration_cast<std::chrono::duration<double>>(
end - start);
state.SetIterationTime(elapsed_seconds.count());
nums_.clear();
}
}
BENCHMARK(BM_SetSortedInput);
static void BM_UnorderedSetSortedInput(benchmark::State& state) {
clear_up_();
for (auto _ : state) {
auto n = 1000;
while (n--) { nums_.emplace_back(gen_node_()); }
std::sort(nums_.begin(), nums_.end());
auto start = std::chrono::high_resolution_clock::now();
std::unordered_set<int> s;
for (const auto& e_ : nums_) { s.insert(e_); }
auto end = std::chrono::high_resolution_clock::now();
auto elapsed_seconds =
std::chrono::duration_cast<std::chrono::duration<double>>(
end - start);
state.SetIterationTime(elapsed_seconds.count());
nums_.clear();
}
}
BENCHMARK(BM_UnorderedSetSortedInput);
static void BM_UniqueSortedInput(benchmark::State& state) {
clear_up_();
for (auto _ : state) {
auto n = 1000;
while (n--) { nums_.emplace_back(gen_node_()); }
std::sort(nums_.begin(), nums_.end());
auto start = std::chrono::high_resolution_clock::now();
auto uniq_iter = std::unique(nums_.begin(), nums_.end());
nums_.erase(uniq_iter, nums_.end());
auto end = std::chrono::high_resolution_clock::now();
auto elapsed_seconds =
std::chrono::duration_cast<std::chrono::duration<double>>(
end - start);
state.SetIterationTime(elapsed_seconds.count());
nums_.clear();
}
}
BENCHMARK(BM_UniqueSortedInput);
Results:
Here are the results:
2021-01-05 23:16:32
Running ./src/practice
Run on (16 X 3700 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 64 KiB (x8)
L2 Unified 512 KiB (x8)
L3 Unified 8192 KiB (x2)
Load Average: 1.74, 1.14, 0.74
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
***WARNING*** Library was built as DEBUG. Timings may be affected.
---------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------
BM_StringCreation 32.4 ns 32.4 ns 21493232
BM_StringCopy 74.6 ns 74.6 ns 9293176
BM_Set 291289 ns 291217 ns 2295
BM_UnorderedSet 138328 ns 138287 ns 5368
BM_UniqueSort 205645 ns 205554 ns 3515
BM_SetSortedInput 413038 ns 412956 ns 1708
BM_UnorderedSetSortedInput 289016 ns 288956 ns 2414
BM_UniqueSortedInput 197036 ns 196992 ns 3446
Benchmark
Refer: https://quick-bench.com/q/kq7yeDlz9R6HV-0XE37eRiGINYM
Funny thing is that a std::set
and std::unordered_set
perform worse on an
already sorted input.
The best bang for buck appears to be from using std::unordered_set
on unsorted data!
sudo ~/bin/perf stat -e "sched:sched_process*,task:*,L1-dcache-loads,L1-dcache-load-misses,cycles,cs,faults,migrations" -d -d -d ./src/project_name
<-dcache-loads,L1-dcache-load-misses,cycles,cs,faults,migrations" -d -d -d ./src/project_name
2021-01-05 23:18:41
Running ./src/project_name
Run on (16 X 3700 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 64 KiB (x8)
L2 Unified 512 KiB (x8)
L3 Unified 8192 KiB (x2)
Load Average: 0.62, 0.97, 0.73
---------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------
BM_StringCreation 32.1 ns 32.1 ns 21413542
BM_StringCopy 69.3 ns 69.3 ns 9886036
BM_Set 287445 ns 287279 ns 2395
BM_UnorderedSet 128834 ns 128765 ns 4881
BM_UniqueSort 198459 ns 198352 ns 3642
BM_SetSortedInput 406139 ns 405924 ns 1675
BM_UnorderedSetSortedInput 289739 ns 289561 ns 2444
BM_UniqueSortedInput 193530 ns 193429 ns 3574
Performance counter stats for './src/project_name':
0 sched:sched_process_free
1 sched:sched_process_exit
0 sched:sched_process_wait
0 sched:sched_process_fork
1 sched:sched_process_exec
0 sched:sched_process_hang
0 task:task_newtask
1 task:task_rename
40,684,273,179 L1-dcache-loads (41.65%)
1,721,548 L1-dcache-load-misses # 0.00% of all L1-dcache hits (41.71%)
32,401,365,957 cycles (41.76%)
746 cs
147 faults
2 migrations
40,700,296,235 L1-dcache-loads (41.79%)
1,678,961 L1-dcache-load-misses # 0.00% of all L1-dcache hits (41.79%)
<not supported> LLC-loads
<not supported> LLC-load-misses
4,213,638,640 L1-icache-loads (41.71%)
1,964,508 L1-icache-load-misses # 0.05% of all L1-icache hits (41.66%)
236,237 dTLB-loads (41.61%)
82,233 dTLB-load-misses # 34.81% of all dTLB cache hits (41.58%)
37,522 iTLB-loads (41.58%)
7,523 iTLB-load-misses # 20.05% of all iTLB cache hits (41.58%)
575,582 L1-dcache-prefetches (41.58%)
<not supported> L1-dcache-prefetch-misses
7.665853273 seconds time elapsed
7.644790000 seconds user
0.000000000 seconds sys
My Podcast!
If you like topics such as this then please consider subscribing to my podcast. I talk to some of the stalwarts in tech and ask them what their favorite productivity hacks are:
Available on iTunes Podcast
Visit Void Star Podcast’s page on iTunes Podcast Portal. Please Click ‘Subscribe’, leave a comment.