Commit 17674ef2 authored by Alexander Gerasiov's avatar Alexander Gerasiov

New upstream version 1.0+dfsg1

parents
*.exe
*.obj
*.o
*.dSYM
benchmark
primes
tester
/build
CMakeCache.txt
CMakeFiles
CMakeScripts
Makefile
cmake_install.cmake
install_manifest.txt
CTestTestfile.cmake
# Changelog
This is a list of notable changes to libdivide.
## [1.0](https://github.com/ridiculousfish/libdivide/releases/tag/v1.0) - 2018-01-21
* BREAKING
* Branchfull divider must not be ```0``` ([#38](https://github.com/ridiculousfish/libdivide/pull/38))
* Branchfree divider must not be ```-1```, ```0```, ```1``` ([#38](https://github.com/ridiculousfish/libdivide/pull/38))
* ENHANCEMENT
* Add proper error handling ([#38](https://github.com/ridiculousfish/libdivide/pull/38))
* Add C++ support for ```/=``` operator
* Speedup 64-bit divisor recovery by up to 30%
* Simplify C++ templates
* Add include guards to ```libdivide.h```!
* Get rid of ```goto``` in ```libdivide_128_div_64_to_64()```
* Use ```#if defined(MACRO)``` instead of ```#if MACRO```
* Silence compiler warnings from crash functions
* TESTING
* Tests should ```exit(1)``` on error, required by ```make test```
* Silence unused parameter warnings
* Silence GCC 7.2.0 maybe uninitialized warnings
* Silence unused return value warning
* BUILD
* Port build system from ```make``` to ```CMake```
* Automatically detect if the CPU and compiler support SSE2
* Automatically enable C++11
* DOCS
* Update build instructions in ```README.md```
* Update benchmark section with branchfree divider
* Add C example section
* Add C++ example section
* Add "Branchfull vs branchfree" section
* Add section about unswitching
* New ```CHANGELOG.md``` file
cmake_minimum_required(VERSION 3.1)
project(libdivide C CXX)

include(CheckCXXCompilerFlag)
include(CMakePushCheckState)

# Require C++11 for the C++ targets ############################

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Debug builds additionally define LIBDIVIDE_ASSERTIONS_ON #####

set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DLIBDIVIDE_ASSERTIONS_ON")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DLIBDIVIDE_ASSERTIONS_ON")

# By default enable release mode ###############################

# Only pick a default when the user did not request a build type, so that
# e.g. -DCMAKE_BUILD_TYPE=RelWithDebInfo (or a packager's custom type) is
# respected instead of being overwritten with Release. Multi-config
# generators ignore CMAKE_BUILD_TYPE, so they are left alone.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
endif()

# Enable GCC/Clang warnings ####################################

set(WALL_FLAGS "-Wall -Wno-unknown-pragmas -fstrict-aliasing")
check_cxx_compiler_flag("${WALL_FLAGS}" wall)

if(wall)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WALL_FLAGS}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WALL_FLAGS}")
endif()

# Check -msse2 compiler flag ###################################

# -Werror is only active for the duration of the check so that a compiler
# which merely warns about -msse2 is treated as not supporting it.
cmake_push_check_state()
set(CMAKE_REQUIRED_FLAGS -Werror)
check_cxx_compiler_flag(-msse2 msse2)
cmake_pop_check_state()

if(msse2)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
    add_definitions(-DLIBDIVIDE_USE_SSE2)
endif()

# Build binaries ###############################################

find_package(Threads REQUIRED)

add_executable(tester libdivide_test.cpp)
add_executable(benchmark libdivide_benchmark.c)
add_executable(primes_benchmark primes_benchmark.cpp)

# Only the tester is linked against the threading library
target_link_libraries(tester Threads::Threads)

# Enable testing ###############################################

enable_testing()
add_test(tester tester)
add_test(primes_benchmark primes_benchmark)

# By default install to /usr/local/include #####################

# A relative DESTINATION is resolved against CMAKE_INSTALL_PREFIX at install
# time, which keeps the install rule relocatable.
install(FILES libdivide.h DESTINATION include)
libdivide
Copyright (C) 2010 ridiculous_fish
libdivide is made available under two licenses. You may choose either
of the following licenses when using libdivide.
zlib License
------------
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Boost License
-------------
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
libdivide@ridiculousfish.com
# libdivide
[![Build Status](https://ci.appveyor.com/api/projects/status/github/ridiculousfish/libdivide?branch=master&svg=true)](https://ci.appveyor.com/project/kimwalisch/libdivide)
```libdivide.h``` is a header-only C/C++ library for optimizing integer division,
it has both a [C API](http://libdivide.com/documentation.html#c_api) and a
[C++ API](http://libdivide.com/documentation.html#cpp_api). This is a summary of
how to use libdivide's testing tools to develop on libdivide itself.
See http://libdivide.com for more information on libdivide.
libdivide has 2 test tools:
* A verification utility **tester** used to help ensure that the division algorithm is correct.
* A benchmarking utility **benchmark** used to measure the speed increase.
# Build instructions
The tester and benchmark programs can be built using cmake and a recent C++ compiler
that supports C++11 or later. Optionally ```libdivide.h``` can also be installed to
```/usr/local/include```.
```bash
cmake .
make -j
sudo make install
```
# Tester binary
You can pass the **tester** binary one or more of the following arguments: ```u32```,
```s32```, ```u64```, ```s64``` to test the four cases (signed, unsigned, 32 bit, or 64 bit), or
run it with no arguments to test all four. The tester is multithreaded so it can test multiple
cases simultaneously. The tester will verify the correctness of libdivide via a set of randomly
chosen denominators, by comparing the result of libdivide's division to hardware division. It
may take a long time to run, but it will output as soon as it finds a discrepancy.
# Benchmark binary
You can pass the **benchmark** binary one or more of the following arguments: ```u32```,
```s32```, ```u64```, ```s64``` to compare libdivide's speed against hardware division.
**benchmark** tests a simple function that inputs an array of random numerators and a single
divisor, and returns the sum of their quotients. It tests this using both hardware division, and
the various division approaches supported by libdivide, including vector division.
It will output data like this:
```bash
# system scalar scl_bf scl_us vector vec_bf vec_us gener algo
1 5.453 0.654 0.570 0.223 0.565 0.603 0.235 1.282 0
2 5.453 1.045 0.570 0.496 0.568 0.603 0.511 11.215 1
3 5.453 1.534 0.570 0.570 0.587 0.603 0.570 11.887 2
4 5.409 0.654 0.570 0.223 0.565 0.603 0.235 1.282 0
5 5.409 1.045 0.570 0.496 0.568 0.603 0.509 11.215 1
6 5.409 1.534 0.570 0.570 0.587 0.603 0.570 11.887 2
...
```
It will keep going as long as you let it, so it's best to stop it when you are happy with the
denominators tested. These columns have the following significance. All times are in
nanoseconds, and lower is better.
```bash
#: The divisor that is tested
system: Hardware divide time
scalar: libdivide time, using scalar functions
scl_bf: libdivide time, using branchfree scalar functions
scl_us: libdivide time, using scalar unswitching functions
vector: libdivide time, using vector functions
vec_bf: libdivide time, using branchfree vector functions
vec_us: libdivide time, using vector unswitching
gener: Time taken to generate the divider struct
algo: The algorithm used. See libdivide_*_get_algorithm
```
The benchmarking utility will also verify that each function returns the same value,
so **benchmark** is valuable for its verification as well.
# C++ example
The first code snippet divides all integers in a vector using integer division. This is slow as
integer division is at least one order of magnitude slower than any other integer arithmetic
operation on current CPUs.
```C++
void divide(std::vector<int64_t>& vect, int64_t divisor)
{
// Slow, uses integer division
for (auto& n : vect)
n /= divisor;
}
```
The second code snippet runs much faster: it uses libdivide to compute the integer division
using multiplication and bit shifts, hence avoiding the slow integer division operation.
```C++
void divide(std::vector<int64_t>& vect, int64_t divisor)
{
libdivide::divider<int64_t> fast_d(divisor);
// Fast, computes division using libdivide
for (auto& n : vect)
n /= fast_d;
}
```
Generally libdivide will give a significant speedup if:
* The divisor is only known at runtime
* The divisor is reused multiple times e.g. in a loop
# C example
When using libdivide's C API you first need to generate a libdivide divider using
one of the ```libdivide_*_gen``` functions (*:&nbsp;s32,&nbsp;u32,&nbsp;s64,&nbsp;u64)
which can then be used to compute the actual integer division using the
corresponding ```libdivide_*_do``` or ```libdivide_*_branchfree_do``` functions.
```C
void divide(int64_t *array, size_t count, int64_t divisor)
{
struct libdivide_s64_t fast_d = libdivide_s64_gen(divisor);
// Fast, computes division using libdivide
for (size_t i = 0; i < count; i++)
array[i] = libdivide_s64_do(array[i], &fast_d);
}
```
For more information please visit the [C API documentation](http://libdivide.com/documentation.html#c_api) on libdivide's website.
# Branchfull vs branchfree
The default libdivide divider makes use of
[branches](https://en.wikipedia.org/wiki/Branch_(computer_science)) to compute the integer
division. When the same divider is used inside a hot loop as in the C++ example section the
CPU will accurately predict the branches and there will be no performance slowdown. Often
the compiler is even able to move the branches outside the body of the loop hence
completely eliminating the branches, this is called loop-invariant code motion.
If however you are e.g. iterating over an array of dividers the CPU will not accurately predict
the branches and this will deteriorate performance. For this use case the branchfree divider
type will often run significantly faster, it computes the integer division without use of any
branches.
```C++
// Use branchfree divider
using branchfree_t = libdivide::branchfree_divider<uint64_t>;
uint64_t divide(uint64_t x, std::vector<branchfree_t>& vect)
{
uint64_t sum = 0;
for (auto& fast_d : vect)
sum += x / fast_d;
return sum;
}
```
Caveats of branchfree divider:
* Branchfree divider cannot be ```-1```, ```0```, ```1```
* Faster for unsigned types than for signed types
# Unswitching
We mentioned in the "Branchfull vs branchfree" section that the default branchfull
libdivide divider uses branches. It is possible to get rid of the branches and the
preliminary checks when using the default branchfull divider using a technique
called unswitching. **Unswitching** moves out of the body of the loop the
preliminary algorithm check so that the computation inside the loop is branchfree.
```C++
using namespace libdivide;
void divide(std::vector<int64_t>& vect, int64_t divisor)
{
divider<int64_t> fast_d(divisor);
switch (fast_d.get_algorithm())
{
case 0: for (auto& n : vect) n /= unswitch<0>(fast_d); break;
case 1: for (auto& n : vect) n /= unswitch<1>(fast_d); break;
case 2: for (auto& n : vect) n /= unswitch<2>(fast_d); break;
case 3: for (auto& n : vect) n /= unswitch<3>(fast_d); break;
case 4: for (auto& n : vect) n /= unswitch<4>(fast_d); break;
}
}
```
For more information please visit the [API documentation](http://libdivide.com/documentation.html) on libdivide's website.
# Contributing
Before sending in patches to libdivide, please run the tester to completion with all four types,
and the benchmark utility for a reasonable period, to ensure that you have not introduced a
regression.
**Happy hacking!**
# Automated Windows (MSVC++) testing using appveyor.com
# https://ci.appveyor.com/projects
# Build label
version: 1.0.{build}
# Restrict CI to the master branch
branches:
only:
- master
# Build matrix: one job per target platform
platform:
- x86
- x64
# NOTE(review): the matrix configuration is Debug, but build_script and
# test_script below hard-code the Release configuration - confirm this is intended.
configuration:
- Debug
# Extra preprocessor definitions handed to the compiler through CXXFLAGS
environment:
CXXFLAGS: /DLIBDIVIDE_USE_SSE2 /DLIBDIVIDE_ASSERTIONS_ON
os: Visual Studio 2017
# Select the CMake generator that matches the platform, then build Release
build_script:
- if "%platform%" == "x86" cmake . -G "Visual Studio 15 2017"
- if "%platform%" == "x64" cmake . -G "Visual Studio 15 2017 Win64"
- cmake --build . --config Release
# Run the test binaries from the Release output directory
test_script:
- cd Release
- tester.exe
- primes_benchmark.exe
/*
Reference implementations of computing and using the "magic number" approach to dividing
by constants, including codegen instructions. The unsigned division incorporates the
"round down" optimization per ridiculous_fish.
This is free and unencumbered software. Any copyright is dedicated to the Public Domain.
*/
#include <limits.h> //for CHAR_BIT
#include <assert.h>
/* Types used in the computations below. These can be redefined to the types appropriate
for the desired division type (i.e. uint can be defined as unsigned long long).
Note that the uint type is used in compute_signed_magic_info, so the uint type must
not be smaller than the sint type.
*/
typedef unsigned int uint;
typedef signed int sint;
/* Computes "magic info" for performing signed division by a fixed integer D.
The type 'sint' is assumed to be defined as a signed integer type large enough
to hold both the dividend and the divisor.
Here >> is arithmetic (signed) shift, and >>> is logical shift.
To emit code for n/D, rounding towards zero, use the following sequence:
m = compute_signed_magic_info(D)
emit("result = (m.multiplier * n) >> SINT_BITS");
if D > 0 and m.multiplier < 0: emit("result += n")
if D < 0 and m.multiplier > 0: emit("result -= n")
if m.shift > 0: emit("result >>= m.shift")
emit("result += (result < 0)")
The shifts by SINT_BITS may be "free" if the high half of the full multiply
is put in a separate register.
The final add can of course be implemented via the sign bit, e.g.
result += (result >>> (SINT_BITS - 1))
or
result -= (result >> (SINT_BITS - 1))
This code is heavily indebted to Hacker's Delight by Henry Warren.
See http://www.hackersdelight.org/HDcode/magic.c.txt
Used with permission from http://www.hackersdelight.org/permissions.htm
*/
/* Result of compute_signed_magic_info(): the multiplier/shift pair used to
   replace a signed division by a fixed divisor with a multiply and a shift. */
struct magics_info {
sint multiplier; // the "magic number" multiplier (may be negative)
unsigned shift; // right shift applied to the result after multiplying
};
struct magics_info compute_signed_magic_info(sint D);
/* Computes "magic info" for performing unsigned division by a fixed positive integer D.
The type 'uint' is assumed to be defined as an unsigned integer type large enough
to hold both the dividend and the divisor. num_bits can be set appropriately if n is
known to be smaller than the largest uint; if this is not known then pass
(sizeof(uint) * CHAR_BIT) for num_bits.
Assume we have a hardware register of width UINT_BITS, a known constant D which is
not zero and not a power of 2, and a variable n of width num_bits (which may be
up to UINT_BITS). To emit code for n/d, use one of the two following sequences
(here >>> refers to a logical bitshift):
m = compute_unsigned_magic_info(D, num_bits)
if m.pre_shift > 0: emit("n >>>= m.pre_shift")
if m.increment: emit("n = saturated_increment(n)")
emit("result = (m.multiplier * n) >>> UINT_BITS")
if m.post_shift > 0: emit("result >>>= m.post_shift")
or
m = compute_unsigned_magic_info(D, num_bits)
if m.pre_shift > 0: emit("n >>>= m.pre_shift")
emit("result = m.multiplier * n")
if m.increment: emit("result = result + m.multiplier")
emit("result >>>= UINT_BITS")
if m.post_shift > 0: emit("result >>>= m.post_shift")
The shifts by UINT_BITS may be "free" if the high half of the full multiply
is put in a separate register.
saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
if n == (1 << UINT_BITS)-1: result = n
else: result = n+1
A common way to implement this is with the carry bit. For example, on x86:
add 1
sbb 0
Some invariants:
1: At least one of pre_shift and increment is zero
2: multiplier is never zero
This code incorporates the "round down" optimization per ridiculous_fish.
*/
/* Result of compute_unsigned_magic_info(): the constants used to replace an
   unsigned division by a fixed divisor with shifts and a multiply. */
struct magicu_info {
uint multiplier; // the "magic number" multiplier
unsigned pre_shift; // shift for the dividend before multiplying
unsigned post_shift; // shift for the dividend after multiplying
int increment; // 0 or 1; if set then increment the numerator, using one of the two strategies
};
struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits);
/* Implementations follow */
/* Computes the multiplier, shifts and increment flag for unsigned division by D.
   D:        the divisor; must be nonzero and must not be a power of 2.
   num_bits: width in bits of the numerators, between 1 and the width of uint.
   Returns a magicu_info chosen from three strategies: the plain "round up"
   multiplier, the "round down" multiplier with increment, or (for even D)
   a pre-shift followed by the magic numbers of the odd part of D. */
struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) {
    // Width of a uint in bits
    const unsigned UINT_BITS = sizeof(uint) * CHAR_BIT;

    // The numerator must fit in a uint
    assert(num_bits > 0 && num_bits <= sizeof(uint) * CHAR_BIT);

    // D must be larger than zero and not a power of 2
    assert(D & (D-1));

    // Headroom available because numerators only occupy num_bits of a uint
    const unsigned slack_bits = UINT_BITS - num_bits;

    // Start one power of 2 below the first one that can possibly work,
    // tracking its quotient and remainder with respect to D
    const uint base_power = (uint)1 << (UINT_BITS - 1);
    uint quot = base_power / D;
    uint rem = base_power % D;

    // Bit length of D; equals ceil(log_2 D) because D is not a power of 2
    unsigned log2_ceil = 0;
    uint bits_left = D;
    while (bits_left > 0) {
        log2_ceil++;
        bits_left >>= 1;
    }

    // First exponent (if any) that satisfied the "round down" variant
    int has_magic_down = 0;
    uint down_multiplier = 0;
    unsigned down_exponent = 0;

    // Raise the exponent until some power of 2 works for "round up"
    unsigned exponent = 0;
    while (1) {
        // Double the previous quotient/remainder pair to get this exponent's
        if (rem >= D - rem) {
            // Doubling the remainder reaches or passes D: carry into the quotient
            quot = quot * 2 + 1;
            rem = rem * 2 - D;
        } else {
            quot = quot * 2;
            rem = rem * 2;
        }

        const unsigned total_shift = exponent + slack_bits;

        // Stop once "round up" works. total_shift may exceed the largest
        // supported shift, so the log2_ceil comparison must come first.
        if (total_shift >= log2_ceil || (D - rem) <= ((uint)1 << total_shift))
            break;

        // Remember the first exponent that works for "round down"
        if (!has_magic_down && rem <= ((uint)1 << total_shift)) {
            has_magic_down = 1;
            down_multiplier = quot;
            down_exponent = exponent;
        }

        exponent++;
    }

    struct magicu_info result;

    if (exponent < log2_ceil) {
        // "Round up" is efficient: use it directly
        result.multiplier = quot + 1;
        result.pre_shift = 0;
        result.post_shift = exponent;
        result.increment = 0;
        return result;
    }

    if (D & 1) {
        // Odd divisor: fall back to "round down", which must have been found
        assert(has_magic_down);
        result.multiplier = down_multiplier;
        result.pre_shift = 0;
        result.post_shift = down_exponent;
        result.increment = 1;
        return result;
    }

    // Even divisor: strip the trailing zero bits into a pre-shift of the
    // dividend and compute magic numbers for the remaining odd factor.
    // D is even and nonzero here, so at least one bit is always stripped.
    unsigned pre_shift = 0;
    uint odd_part = D;
    do {
        odd_part >>= 1;
        pre_shift++;
    } while ((odd_part & 1) == 0);

    result = compute_unsigned_magic_info(odd_part, num_bits - pre_shift);
    assert(result.increment == 0 && result.pre_shift == 0); //expect no increment or pre_shift in this path
    result.pre_shift = pre_shift;
    return result;
}
/* Computes the multiplier and shift for signed division by D.
   D: the divisor; must not be zero and must not be a power of 2 or the
      negative of one (checked by assert).
   Returns a magics_info whose multiplier carries the sign adjustment for
   negative D and whose shift is the exponent found minus the width of sint.
   The variable names follow Warren's magic() routine, as noted inline. */
struct magics_info compute_signed_magic_info(sint D) {
    // Bits in an sint
    const unsigned SINT_BITS = sizeof(sint) * CHAR_BIT;

    // Absolute value of D, computed in unsigned arithmetic. Negating the most
    // negative sint as a signed value would overflow (undefined behaviour)
    // before the assert below had a chance to reject it.
    const uint abs_d = (D < 0 ? (uint)0 - (uint)D : (uint)D);

    // D must not be zero and must not be a power of 2 (or its negative)
    assert(abs_d != 0 && (abs_d & (abs_d - 1)) != 0);

    // Our result
    struct magics_info result;

    // The initial power of 2 is one less than the first one that can possibly work
    // "two31" in Warren
    unsigned exponent = SINT_BITS - 1;
    const uint initial_power_of_2 = (uint)1 << exponent;

    // Compute the absolute value of our "test numerator,"
    // which is the largest dividend whose remainder with d is d-1.
    // This is called anc in Warren.
    const uint tmp = initial_power_of_2 + (D < 0);
    const uint abs_test_numer = tmp - 1 - tmp % abs_d;

    // Initialize our quotients and remainders (q1, r1, q2, r2 in Warren)
    uint quotient1 = initial_power_of_2 / abs_test_numer, remainder1 = initial_power_of_2 % abs_test_numer;
    uint quotient2 = initial_power_of_2 / abs_d, remainder2 = initial_power_of_2 % abs_d;
    uint delta;

    // Begin our loop
    do {
        // Update the exponent
        exponent++;

        // Update quotient1 and remainder1
        quotient1 *= 2;
        remainder1 *= 2;
        if (remainder1 >= abs_test_numer) {
            quotient1 += 1;
            remainder1 -= abs_test_numer;
        }

        // Update quotient2 and remainder2
        quotient2 *= 2;
        remainder2 *= 2;
        if (remainder2 >= abs_d) {
            quotient2 += 1;
            remainder2 -= abs_d;
        }

        // Keep going as long as (2**exponent) / abs_d <= delta
        delta = abs_d - remainder2;
    } while (quotient1 < delta || (quotient1 == delta && remainder1 == 0));

    result.multiplier = quotient2 + 1;
    if (D < 0) result.multiplier = -result.multiplier;
    result.shift = exponent - SINT_BITS;
    return result;
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#include <algorithm>
#include <functional>
#include <iostream>
#include <chrono>
#include <deque>
#include <vector>
#include <cstring>
#include <cstdlib>
#include "libdivide.h"
#if defined(__GNUC__)
#define NOINLINE __attribute__((__noinline__))
#else
#define NOINLINE
#endif
// Bundles a prime with a libdivide divider constructed from the same value,
// so a loop can divide by the prime and still read the prime itself.
template<typename T, int ALGO>
struct prime_divider_t {
    T value;                              // the prime itself
    libdivide::divider<T, ALGO> divider;  // divider built from the prime

    // Deliberately not explicit: count_primes_libdivide() pushes plain T
    // values into a vector of prime_divider_t and relies on the conversion.
    prime_divider_t(T prime)
        : value(prime),
          divider(prime)
    {}
};
// Builds a list of primes by trial division, dividing through libdivide
// dividers instead of the built-in operators.
// The list is seeded with 2 unconditionally; every odd candidate from 3 up to
// (but excluding) max is then tested against all primes found so far.
// Returns the number of entries in the list.
template<typename T, int ALGO> NOINLINE
size_t count_primes_libdivide(T max)
{
    typedef prime_divider_t<T, ALGO> entry_t;

    std::vector<entry_t> found;
    found.push_back(2);

    for (T candidate = 3; candidate < max; candidate += 2) {
        auto it = found.cbegin();
        const auto last = found.cend();

        // Advance until some known prime divides the candidate evenly
        while (it != last) {
            const T quot = candidate / it->divider;
            // Remainder recovered from the quotient: n - (n / p) * p
            const T rem = candidate - quot * it->value;
            if (rem == 0)
                break;
            ++it;
        }

        // Reaching the end means no known prime divides the candidate
        if (it == last)
            found.push_back(candidate);
    }

    return found.size();
}
// Builds a list of primes by trial division using the built-in % operator.
// The list is seeded with 2 unconditionally; every odd candidate from 3 up to
// (but excluding) max is then tested against all primes found so far.
// Returns the number of entries in the list.
template<typename T> NOINLINE
size_t count_primes_system(T max)
{
    std::vector<T> found;
    found.push_back(2);

    for (T candidate = 3; candidate < max; candidate += 2) {
        auto it = found.cbegin();
        const auto last = found.cend();

        // Advance until some known prime divides the candidate evenly
        while (it != last) {
            if (candidate % *it == 0)
                break;
            ++it;
        }

        // Reaching the end means no known prime divides the candidate
        if (it == last)
            found.push_back(candidate);
    }

    return found.size();
}
// Calls func(args...) once and measures how long the call takes.
//   func: the callable to time
//   args: arguments passed to func (taken and forwarded by value)
// Returns a pair of {elapsed time in seconds, value returned by func}.
template<typename Ret, typename... Args>
std::pair<double, Ret> time_function(std::function<Ret(Args...)> func, Args... args) {
    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock.
    typedef std::chrono::steady_clock timer_clock;

    const timer_clock::time_point start = timer_clock::now();
    // Keep the result in its own type; holding it in a size_t would silently
    // convert any other return type (e.g. truncate a double).
    Ret result = func(args...);
    const timer_clock::time_point stop = timer_clock::now();

    const std::chrono::duration<double> elapsed = stop - start;
    return std::pair<double, Ret>(elapsed.count(), std::move(result));
}
// Outcome of one measured prime calculation.
// NOTE(review): the code that fills this struct lies outside the visible part
// of the file; the field meanings below are inferred from their names and types.
struct prime_calculation_result_t {
double duration; // presumably the measured running time - confirm the unit against the producer
size_t result; // presumably the value returned by the measured function
};
template<typename T, size_t Func(T)>
prime_calculation_result_t measure_1_prime_calculation(T max, size_t iters) {