Requirements/ System Specifications.
Argon2 Password hashing function package:
https://github.com/P-H-C/phc-winner-argon2
Machine 1:
Aarch64 Fedora 28 version of Linux operating system
Cortex-A57 8 core processor
Two sticks of Dual-Channel DIMM DDR3 8GB RAM (16GB in total)
Machine 2:
Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.70GHz
Four sticks of 8GB DIMM DDR4 RAM at 2.4 GHz (32 GB of RAM in total)
x86_64 Fedora 28 version of Linux Operating System
Approach:
I will test the changed code on machine 1. This is a continuation of the last blog titled: “Project: Part3 – Optimizing and porting argon2 package using C and Assembler language(Progress 2)”.
Here is the modified version of bench.c from the argon2 password hashing function:
/*
* Argon2 reference source code package - reference C implementations
*
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
* - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
* - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
*
* You should have received a copy of both of these licenses along with this
* software. If not, they may be obtained at the above URLs.
*/
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
/* Nanoseconds per second. Defined without a trailing semicolon so that the
 * macro can be used in the middle of an expression, not only at the end of a
 * statement. */
#define BILLION 1000000000L
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include "argon2.h"
/*
static uint64_t rdtsc(void) {
#ifdef _MSC_VER
return __rdtsc();
#else
#if defined(__amd64__) || defined(__x86_64__)
uint64_t rax, rdx;
__asm__ __volatile__("rdtsc" : "=a"(rax), "=d"(rdx) : :);
return (rdx << 32) | rax;
#elif defined(__i386__) || defined(__i386) || defined(__X86__)
uint64_t rax;
__asm__ __volatile__("rdtsc" : "=A"(rax) : :);
return rax;
#else
#error "Not implemented!"
#endif
#endif
}
*/
/*
* Benchmarks Argon2 with salt length 16, password length 16, t_cost 3,
and different m_cost and threads
*/
static void benchmark() {
/* Sizes of the output hash and of the password/salt buffers; the macros are
 * only needed for the array declarations and are undefined right after. */
#define BENCH_OUTLEN 16
#define BENCH_INLEN 16
const uint32_t inlen = BENCH_INLEN;
const unsigned outlen = BENCH_OUTLEN;
unsigned char out[BENCH_OUTLEN];
unsigned char pwd_array[BENCH_INLEN];
unsigned char salt_array[BENCH_INLEN];
#undef BENCH_INLEN
#undef BENCH_OUTLEN
/* Wall-clock samples taken before and after each argon2_hash() call. */
struct timespec start, stop;
double accum;
uint32_t t_cost = 3;
uint32_t m_cost;
uint32_t thread_test[4] = {1, 2, 4, 8};
argon2_type types[3] = {Argon2_i, Argon2_d, Argon2_id};
/* Fixed inputs: password bytes are all 0, salt bytes are all 1. */
memset(pwd_array, 0, inlen);
memset(salt_array, 1, inlen);
/* Sweep the memory cost from 1 << 10 to 1 << 22, doubling each step; it is
 * printed below as "MiB" after shifting right by 10. */
for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2) {
unsigned i;
/* For every memory cost, try each thread count in thread_test. */
for (i = 0; i < 4; ++i) {
double run_time = 0;
uint32_t thread_n = thread_test[i];
unsigned j;
/* For every thread count, run each Argon2 variant in types. */
for (j = 0; j < 3; ++j) {
/*clock_t start_time, stop_time;
uint64_t start_cycles, stop_cycles;
uint64_t delta;
double mcycles;*/
argon2_type type = types[j];
/*start_time = clock();
start_cycles = rdtsc();*/
/* NOTE(review): on success the clock is sampled a second time in the else
 * branch, overwriting the value just read; the second call is redundant and
 * its return value is not checked. */
if( clock_gettime( CLOCK_REALTIME, &start) == -1 ) {
perror( "clock gettime" );
exit( EXIT_FAILURE );
}
else
{
clock_gettime(CLOCK_REALTIME, &start);
}
/* The call being measured; its return value is ignored. */
argon2_hash(t_cost, m_cost, thread_n, pwd_array, inlen,
salt_array, inlen, out, outlen, NULL, 0, type,
ARGON2_VERSION_NUMBER);
/*stop_cycles = rdtsc();
stop_time = clock();*/
/*delta = (stop_cycles - start_cycles) / (m_cost);
mcycles = (double)(stop_cycles - start_cycles) / (1UL << 20);
run_time += ((double)stop_time - start_time) / (CLOCKS_PER_SEC);*/
/* NOTE(review): same double sampling as for start. */
if( clock_gettime( CLOCK_REALTIME, &stop) == -1 ) {
perror( "clock gettime" );
exit( EXIT_FAILURE );
}
else
{
clock_gettime(CLOCK_REALTIME, &stop);
}
/* NOTE(review): this adds a difference in seconds to a difference in
 * nanoseconds without scaling either one. When stop.tv_nsec is smaller than
 * start.tv_nsec (the call crossed a second boundary) the nanosecond term is
 * negative and the sum becomes negative. */
accum = ( (double)stop.tv_sec - start.tv_sec )
+ ( (double)stop.tv_nsec - start.tv_nsec );
/* These values are derived from the timespec difference above, not from a
 * cycle counter, even though they are printed as "cpb" and "Mcycles". */
double mcycles = accum / (1UL << 20);
/* NOTE(review): converting a negative double to uint64_t is undefined
 * behavior in C, so delta is meaningless whenever accum is negative. */
uint64_t delta = accum / (m_cost);
printf("%s %d iterations %d MiB %d threads: %2.2f cpb %2.2f "
"Mcycles \n", argon2_type2string(type, 1), t_cost,
m_cost >> 10, thread_n, (float)delta / 1024, mcycles);
/* NOTE(review): run_time is reset on every pass, so after the loop it holds
 * only the value of the last variant instead of the sum of all three. */
run_time = 0;
run_time += accum / BILLION;
/*run_time += accum;
printf("%2.4f seconds\n\n", (double)run_time);*/
}
/* One "seconds" line per (memory cost, thread count) pair. */
printf("%2.4f seconds\n\n", run_time);
}
}
}
/* Entry point: runs the whole benchmark sweep, then exits with ARGON2_OK. */
int main() {
    const int exit_status = ARGON2_OK;

    benchmark();
    return exit_status;
}
The basic x86_64 test done in the previous blog shows how the program is intended to run. The program is supposed to count the number of CPU cycles spent while running the program’s main call, “argon2_hash(t_cost, m_cost, thread_n, pwd_array, inlen,
salt_array, inlen, out, outlen, NULL, 0, type, ARGON2_VERSION_NUMBER);”. I did not expect the rdtsc counter found in the x86_64 architecture to be such a complicated problem to port.
This is the portion of the code that I assume calculates the CPU cycles:
delta = (stop_cycles - start_cycles) / (m_cost); mcycles = (double)(stop_cycles - start_cycles) / (1UL << 20); run_time += ((double)stop_time - start_time) / (CLOCKS_PER_SEC);
The calculation is straightforward: delta is the stop value minus the start value, divided by the variable m_cost. m_cost is generated by the for loop seen below:
for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2)
My mistake:
When looking at the original code, I noticed that the program updated a variable, run_time, that I forgot to include:
run_time += ((double)stop_time - start_time) / (CLOCKS_PER_SEC);
I made the change and rebuilt the program using the Makefile.
cc -std=c89 -O2 -Wall -g -Iinclude -Isrc -pthread src/argon2.c src/core.c src/blake2/blake2b.c src/thread.c src/encoding.c src/ref.c src/bench.c -o bench
Here is the changed code:
/*
* Argon2 reference source code package - reference C implementations
*
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
* - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
* - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
*
* You should have received a copy of both of these licenses along with this
* software. If not, they may be obtained at the above URLs.
*/
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#define BILLION 1000000000L;
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include "argon2.h"
/*
static uint64_t rdtsc(void) {
#ifdef _MSC_VER
return __rdtsc();
#else
#if defined(__amd64__) || defined(__x86_64__)
uint64_t rax, rdx;
__asm__ __volatile__("rdtsc" : "=a"(rax), "=d"(rdx) : :);
return (rdx << 32) | rax;
#elif defined(__i386__) || defined(__i386) || defined(__X86__)
uint64_t rax;
__asm__ __volatile__("rdtsc" : "=A"(rax) : :);
return rax;
#elif defined(__aarch64__)
return 1;
#else
return 0;
#endif
#endif
}
*/
/*
* Benchmarks Argon2 with salt length 16, password length 16, t_cost 3,
and different m_cost and threads
*/
static void benchmark() {
/* Sizes of the output hash and of the password/salt buffers; the macros are
 * only needed for the array declarations and are undefined right after. */
#define BENCH_OUTLEN 16
#define BENCH_INLEN 16
const uint32_t inlen = BENCH_INLEN;
const unsigned outlen = BENCH_OUTLEN;
unsigned char out[BENCH_OUTLEN];
unsigned char pwd_array[BENCH_INLEN];
unsigned char salt_array[BENCH_INLEN];
#undef BENCH_INLEN
#undef BENCH_OUTLEN
/* Wall-clock samples taken before and after each argon2_hash() call. */
struct timespec start, stop;
double accum;
uint32_t t_cost = 3;
uint32_t m_cost;
uint32_t thread_test[4] = {1, 2, 4, 8};
argon2_type types[3] = {Argon2_i, Argon2_d, Argon2_id};
/* Fixed inputs: password bytes are all 0, salt bytes are all 1. */
memset(pwd_array, 0, inlen);
memset(salt_array, 1, inlen);
/* Sweep the memory cost from 1 << 10 to 1 << 22, doubling each step; it is
 * printed below as "MiB" after shifting right by 10. */
for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2) {
unsigned i;
/* For every memory cost, try each thread count in thread_test. */
for (i = 0; i < 4; ++i) {
double run_time = 0;
uint32_t thread_n = thread_test[i];
unsigned j;
/* For every thread count, run each Argon2 variant in types. */
for (j = 0; j < 3; ++j) {
/*clock_t start_time, stop_time;
uint64_t start_cycles, stop_cycles;
uint64_t delta;
double mcycles;*/
argon2_type type = types[j];
/*start_time = clock();
start_cycles = rdtsc();*/
/* NOTE(review): on success the clock is sampled a second time in the else
 * branch, overwriting the value just read; the second call is redundant and
 * its return value is not checked. */
if( clock_gettime( CLOCK_REALTIME, &start) == -1 ) {
perror( "clock gettime" );
exit( EXIT_FAILURE );
}
else
{
clock_gettime(CLOCK_REALTIME, &start);
}
/* The call being measured; its return value is ignored. */
argon2_hash(t_cost, m_cost, thread_n, pwd_array, inlen,
salt_array, inlen, out, outlen, NULL, 0, type,
ARGON2_VERSION_NUMBER);
/*stop_cycles = rdtsc();
stop_time = clock();*/
/*delta = (stop_cycles - start_cycles) / (m_cost);
mcycles = (double)(stop_cycles - start_cycles) / (1UL << 20);
run_time += ((double)stop_time - start_time) / (CLOCKS_PER_SEC);*/
/* NOTE(review): same double sampling as for start. */
if( clock_gettime( CLOCK_REALTIME, &stop) == -1 ) {
perror( "clock gettime" );
exit( EXIT_FAILURE );
}
else
{
clock_gettime(CLOCK_REALTIME, &stop);
}
/* NOTE(review): this adds a difference in seconds to a difference in
 * nanoseconds without scaling either one. When stop.tv_nsec is smaller than
 * start.tv_nsec (the call crossed a second boundary) the nanosecond term is
 * close to minus one billion and accum becomes negative, which is consistent
 * with the negative "Mcycles" values in the results recorded for this
 * version. */
accum = ( (double)stop.tv_sec - (double)start.tv_sec )
+ ( (double)stop.tv_nsec - (double)start.tv_nsec );
/* These values are derived from the timespec difference above, not from a
 * cycle counter, even though they are printed as "cpb" and "Mcycles". */
double mcycles = accum / (1UL << 20);
/* NOTE(review): converting a negative double to uint64_t is undefined
 * behavior in C, so delta is meaningless whenever accum is negative. */
uint64_t delta = accum / (m_cost);
printf("%s %d iterations %d MiB %d threads: %2.2f cpb %2.2f "
"Mcycles \n", argon2_type2string(type, 1), t_cost,
m_cost >> 10, thread_n, (float)delta / 1024, mcycles);
/* NOTE(review): the next line has no semicolon of its own; it is terminated
 * only by the semicolon that is part of the BILLION macro definition. */
run_time += accum / BILLION
/* NOTE(review): this adds run_time / CLOCKS_PER_SEC to run_time itself; the
 * value was already converted with BILLION and is not a clock() tick count. */
run_time += run_time / (CLOCKS_PER_SEC);
/*run_time += accum;
printf("%2.4f seconds\n\n", (double)run_time);*/
}
/*run_time = 0;
run_time += accum / BILLION;*/
/* One "seconds" line per (memory cost, thread count) pair. */
printf("%2.4f seconds\n\n", run_time);
}
}
}
/* Entry point: runs the whole benchmark sweep, then exits with ARGON2_OK. */
int main() {
    const int exit_status = ARGON2_OK;

    benchmark();
    return exit_status;
}
Command to run the program:
./bench
Result:
Argon2i 3 iterations 1 MiB 1 threads: 5.38 cpb 5.38 Mcycles Argon2d 3 iterations 1 MiB 1 threads: 4.97 cpb 4.97 Mcycles Argon2id 3 iterations 1 MiB 1 threads: 4.45 cpb 4.45 Mcycles 0.0155 seconds Argon2i 3 iterations 1 MiB 2 threads: 3.50 cpb 3.50 Mcycles Argon2d 3 iterations 1 MiB 2 threads: 3.21 cpb 3.21 Mcycles Argon2id 3 iterations 1 MiB 2 threads: 3.20 cpb 3.20 Mcycles 0.0104 seconds Argon2i 3 iterations 1 MiB 4 threads: 2.69 cpb 2.69 Mcycles Argon2d 3 iterations 1 MiB 4 threads: 2.61 cpb 2.61 Mcycles Argon2id 3 iterations 1 MiB 4 threads: 2.65 cpb 2.65 Mcycles 0.0083 seconds Argon2i 3 iterations 1 MiB 8 threads: 4.43 cpb 4.43 Mcycles Argon2d 3 iterations 1 MiB 8 threads: 4.41 cpb 4.41 Mcycles Argon2id 3 iterations 1 MiB 8 threads: 4.39 cpb 4.39 Mcycles 0.0139 seconds Argon2i 3 iterations 2 MiB 1 threads: 5.21 cpb 10.42 Mcycles Argon2d 3 iterations 2 MiB 1 threads: 4.98 cpb 9.95 Mcycles Argon2id 3 iterations 2 MiB 1 threads: 4.42 cpb 8.84 Mcycles 0.0306 seconds Argon2i 3 iterations 2 MiB 2 threads: 2.81 cpb 5.63 Mcycles Argon2d 3 iterations 2 MiB 2 threads: 2.73 cpb 5.47 Mcycles Argon2id 3 iterations 2 MiB 2 threads: 0.00 cpb -948.16 Mcycles -0.9826 seconds Argon2i 3 iterations 2 MiB 4 threads: 1.88 cpb 3.76 Mcycles Argon2d 3 iterations 2 MiB 4 threads: 1.90 cpb 3.80 Mcycles Argon2id 3 iterations 2 MiB 4 threads: 1.88 cpb 3.76 Mcycles 0.0119 seconds Argon2i 3 iterations 2 MiB 8 threads: 2.52 cpb 5.04 Mcycles Argon2d 3 iterations 2 MiB 8 threads: 2.54 cpb 5.08 Mcycles Argon2id 3 iterations 2 MiB 8 threads: 2.60 cpb 5.20 Mcycles 0.0161 seconds Argon2i 3 iterations 4 MiB 1 threads: 5.29 cpb 21.18 Mcycles Argon2d 3 iterations 4 MiB 1 threads: 4.75 cpb 19.00 Mcycles Argon2id 3 iterations 4 MiB 1 threads: 4.43 cpb 17.72 Mcycles 0.0607 seconds Argon2i 3 iterations 4 MiB 2 threads: 2.60 cpb 10.41 Mcycles Argon2d 3 iterations 4 MiB 2 threads: 2.57 cpb 10.27 Mcycles Argon2id 3 iterations 4 MiB 2 threads: 2.58 cpb 10.31 Mcycles 0.0325 seconds Argon2i 3 iterations 4 MiB 4 
threads: 1.61 cpb 6.42 Mcycles Argon2d 3 iterations 4 MiB 4 threads: 1.59 cpb 6.37 Mcycles Argon2id 3 iterations 4 MiB 4 threads: 1.60 cpb 6.39 Mcycles 0.0201 seconds Argon2i 3 iterations 4 MiB 8 threads: 2.09 cpb 8.35 Mcycles Argon2d 3 iterations 4 MiB 8 threads: 2.06 cpb 8.25 Mcycles Argon2id 3 iterations 4 MiB 8 threads: 2.41 cpb 9.64 Mcycles 0.0275 seconds Argon2i 3 iterations 8 MiB 1 threads: 5.52 cpb 44.13 Mcycles Argon2d 3 iterations 8 MiB 1 threads: 5.00 cpb 40.03 Mcycles Argon2id 3 iterations 8 MiB 1 threads: 4.61 cpb 36.90 Mcycles 0.1269 seconds Argon2i 3 iterations 8 MiB 2 threads: 2.59 cpb 20.76 Mcycles Argon2d 3 iterations 8 MiB 2 threads: 2.57 cpb 20.56 Mcycles Argon2id 3 iterations 8 MiB 2 threads: 2.56 cpb 20.52 Mcycles 0.0648 seconds Argon2i 3 iterations 8 MiB 4 threads: 1.48 cpb 11.85 Mcycles Argon2d 3 iterations 8 MiB 4 threads: 1.49 cpb 11.88 Mcycles Argon2id 3 iterations 8 MiB 4 threads: 1.48 cpb 11.84 Mcycles 0.0373 seconds Argon2i 3 iterations 8 MiB 8 threads: 2.24 cpb 17.95 Mcycles Argon2d 3 iterations 8 MiB 8 threads: 0.00 cpb -939.59 Mcycles Argon2id 3 iterations 8 MiB 8 threads: 2.02 cpb 16.16 Mcycles -0.9495 seconds Argon2i 3 iterations 16 MiB 1 threads: 5.77 cpb 92.33 Mcycles Argon2d 3 iterations 16 MiB 1 threads: 5.31 cpb 84.99 Mcycles Argon2id 3 iterations 16 MiB 1 threads: 5.01 cpb 80.18 Mcycles 0.2700 seconds Argon2i 3 iterations 16 MiB 2 threads: 2.75 cpb 44.05 Mcycles Argon2d 3 iterations 16 MiB 2 threads: 2.73 cpb 43.68 Mcycles Argon2id 3 iterations 16 MiB 2 threads: 2.74 cpb 43.80 Mcycles 0.1379 seconds Argon2i 3 iterations 16 MiB 4 threads: 1.54 cpb 24.66 Mcycles Argon2d 3 iterations 16 MiB 4 threads: 1.51 cpb 24.24 Mcycles Argon2id 3 iterations 16 MiB 4 threads: 1.52 cpb 24.33 Mcycles 0.0768 seconds Argon2i 3 iterations 16 MiB 8 threads: 1.62 cpb 25.92 Mcycles Argon2d 3 iterations 16 MiB 8 threads: 1.68 cpb 26.85 Mcycles Argon2id 3 iterations 16 MiB 8 threads: 1.76 cpb 28.13 Mcycles 0.0848 seconds Argon2i 3 iterations 32 MiB 1 
threads: 5.96 cpb 190.66 Mcycles Argon2d 3 iterations 32 MiB 1 threads: 5.88 cpb 188.16 Mcycles Argon2id 3 iterations 32 MiB 1 threads: 0.00 cpb -765.51 Mcycles -0.4055 seconds Argon2i 3 iterations 32 MiB 2 threads: 3.29 cpb 105.24 Mcycles Argon2d 3 iterations 32 MiB 2 threads: 3.25 cpb 104.07 Mcycles Argon2id 3 iterations 32 MiB 2 threads: 3.26 cpb 104.20 Mcycles 0.3287 seconds Argon2i 3 iterations 32 MiB 4 threads: 1.85 cpb 59.35 Mcycles Argon2d 3 iterations 32 MiB 4 threads: 1.84 cpb 58.92 Mcycles Argon2id 3 iterations 32 MiB 4 threads: 1.85 cpb 59.15 Mcycles 0.1860 seconds Argon2i 3 iterations 32 MiB 8 threads: 1.92 cpb 61.44 Mcycles Argon2d 3 iterations 32 MiB 8 threads: 1.84 cpb 58.89 Mcycles Argon2id 3 iterations 32 MiB 8 threads: 1.99 cpb 63.67 Mcycles 0.1929 seconds Argon2i 3 iterations 64 MiB 1 threads: 0.00 cpb -564.65 Mcycles Argon2d 3 iterations 64 MiB 1 threads: 6.02 cpb 385.31 Mcycles Argon2id 3 iterations 64 MiB 1 threads: 0.00 cpb -567.80 Mcycles -0.7834 seconds Argon2i 3 iterations 64 MiB 2 threads: 3.33 cpb 213.04 Mcycles Argon2d 3 iterations 64 MiB 2 threads: 3.30 cpb 210.98 Mcycles Argon2id 3 iterations 64 MiB 2 threads: 3.30 cpb 211.29 Mcycles 0.6662 seconds Argon2i 3 iterations 64 MiB 4 threads: 1.86 cpb 119.27 Mcycles Argon2d 3 iterations 64 MiB 4 threads: 0.00 cpb -835.44 Mcycles Argon2id 3 iterations 64 MiB 4 threads: 1.85 cpb 118.59 Mcycles -0.6266 seconds Argon2i 3 iterations 64 MiB 8 threads: 1.88 cpb 120.44 Mcycles Argon2d 3 iterations 64 MiB 8 threads: 1.94 cpb 124.37 Mcycles Argon2id 3 iterations 64 MiB 8 threads: 1.63 cpb 104.46 Mcycles 0.3662 seconds Argon2i 3 iterations 128 MiB 1 threads: 0.00 cpb -158.98 Mcycles Argon2d 3 iterations 128 MiB 1 threads: 0.00 cpb -167.45 Mcycles Argon2id 3 iterations 128 MiB 1 threads: 0.00 cpb -165.81 Mcycles -0.5162 seconds Argon2i 3 iterations 128 MiB 2 threads: 3.38 cpb 432.10 Mcycles Argon2d 3 iterations 128 MiB 2 threads: 3.34 cpb 427.70 Mcycles Argon2id 3 iterations 128 MiB 2 threads: 0.00 
cpb -525.12 Mcycles 0.3509 seconds Argon2i 3 iterations 128 MiB 4 threads: 1.88 cpb 240.61 Mcycles Argon2d 3 iterations 128 MiB 4 threads: 1.86 cpb 238.46 Mcycles Argon2id 3 iterations 128 MiB 4 threads: 0.00 cpb -715.31 Mcycles -0.2477 seconds Argon2i 3 iterations 128 MiB 8 threads: 1.56 cpb 199.22 Mcycles Argon2d 3 iterations 128 MiB 8 threads: 1.72 cpb 219.92 Mcycles Argon2id 3 iterations 128 MiB 8 threads: 1.69 cpb 216.88 Mcycles 0.6669 seconds
I will change the placement of the equations in the hope of getting rid of the negative values in the results.
Here is the changed code:
/*
* Argon2 reference source code package - reference C implementations
*
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
* - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
* - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
*
* You should have received a copy of both of these licenses along with this
* software. If not, they may be obtained at the above URLs.
*/
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
/* Nanoseconds per second. Defined without a trailing semicolon so that the
 * macro can be used in the middle of an expression, not only at the end of a
 * statement. */
#define BILLION 1000000000L
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include "argon2.h"
/*
static uint64_t rdtsc(void) {
#ifdef _MSC_VER
return __rdtsc();
#else
#if defined(__amd64__) || defined(__x86_64__)
uint64_t rax, rdx;
__asm__ __volatile__("rdtsc" : "=a"(rax), "=d"(rdx) : :);
return (rdx << 32) | rax;
#elif defined(__i386__) || defined(__i386) || defined(__X86__)
uint64_t rax;
__asm__ __volatile__("rdtsc" : "=A"(rax) : :);
return rax;
#elif defined(__aarch64__)
return 1;
#else
return 0;
#endif
#endif
}
*/
/*
* Benchmarks Argon2 with salt length 16, password length 16, t_cost 3,
and different m_cost and threads
*/
static void benchmark() {
/* Sizes of the output hash and of the password/salt buffers; the macros are
 * only needed for the array declarations and are undefined right after. */
#define BENCH_OUTLEN 16
#define BENCH_INLEN 16
const uint32_t inlen = BENCH_INLEN;
const unsigned outlen = BENCH_OUTLEN;
unsigned char out[BENCH_OUTLEN];
unsigned char pwd_array[BENCH_INLEN];
unsigned char salt_array[BENCH_INLEN];
#undef BENCH_INLEN
#undef BENCH_OUTLEN
/* Wall-clock samples taken before and after each argon2_hash() call. */
struct timespec start, stop;
double accum;
uint32_t t_cost = 3;
uint32_t m_cost;
uint32_t thread_test[4] = {1, 2, 4, 8};
argon2_type types[3] = {Argon2_i, Argon2_d, Argon2_id};
/* Fixed inputs: password bytes are all 0, salt bytes are all 1. */
memset(pwd_array, 0, inlen);
memset(salt_array, 1, inlen);
/* Sweep the memory cost from 1 << 10 to 1 << 22, doubling each step; it is
 * printed below as "MiB" after shifting right by 10. */
for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2) {
unsigned i;
/* For every memory cost, try each thread count in thread_test. */
for (i = 0; i < 4; ++i) {
double run_time = 0;
uint32_t thread_n = thread_test[i];
unsigned j;
/* For every thread count, run each Argon2 variant in types. */
for (j = 0; j < 3; ++j) {
/*clock_t start_time, stop_time;
uint64_t start_cycles, stop_cycles;
uint64_t delta;
double mcycles;*/
argon2_type type = types[j];
/*start_time = clock();
start_cycles = rdtsc();*/
/* NOTE(review): on success the clock is sampled a second time in the else
 * branch, overwriting the value just read; the second call is redundant and
 * its return value is not checked. */
if( clock_gettime( CLOCK_REALTIME, &start) == -1 ) {
perror( "clock gettime" );
exit( EXIT_FAILURE );
}
else
{
clock_gettime(CLOCK_REALTIME, &start);
}
/* The call being measured; its return value is ignored. */
argon2_hash(t_cost, m_cost, thread_n, pwd_array, inlen,
salt_array, inlen, out, outlen, NULL, 0, type,
ARGON2_VERSION_NUMBER);
/*stop_cycles = rdtsc();
stop_time = clock();*/
/*delta = (stop_cycles - start_cycles) / (m_cost);
mcycles = (double)(stop_cycles - start_cycles) / (1UL << 20);
run_time += ((double)stop_time - start_time) / (CLOCKS_PER_SEC);*/
/* NOTE(review): same double sampling as for start. */
if( clock_gettime( CLOCK_REALTIME, &stop) == -1 ) {
perror( "clock gettime" );
exit( EXIT_FAILURE );
}
else
{
clock_gettime(CLOCK_REALTIME, &stop);
}
/* Elapsed time in seconds: whole seconds plus the nanosecond difference
 * divided by BILLION. A negative nanosecond difference is now compensated by
 * the seconds term. */
accum = ( (double)stop.tv_sec - (double)start.tv_sec )
+ ( (double)stop.tv_nsec - (double)start.tv_nsec ) / BILLION;
/* accum is scaled by BILLION and then divided by 2^20. The value comes from
 * the timespec difference, not from a cycle counter, although it is printed
 * as "Mcycles". */
double mcycles = accum * BILLION;
mcycles = mcycles / (1UL << 20);
/* Same scaled value divided by m_cost using integer division, so the
 * fractional part is dropped before the later division by 1024. */
uint64_t delta = accum * BILLION;
delta = delta / (m_cost);
printf("%s %d iterations %d MiB %d threads: %2.2f cpb %2.2f "
"Mcycles \n", argon2_type2string(type, 1), t_cost,
m_cost >> 10, thread_n, (float)delta / 1024, mcycles);
/* NOTE(review): run_time starts at 0 and is only ever updated from itself
 * here, so it stays 0; accum is never added to it in this version. */
run_time += run_time / (CLOCKS_PER_SEC);
/*run_time += accum;
printf("%2.4f seconds\n\n", (double)run_time);*/
}
/* One "seconds" line per (memory cost, thread count) pair. */
printf("%2.4f seconds\n\n", run_time);
}
}
}
/* Entry point: runs the whole benchmark sweep, then exits with ARGON2_OK. */
int main() {
    const int exit_status = ARGON2_OK;

    benchmark();
    return exit_status;
}
Here is the result:
Argon2i 3 iterations 1 MiB 1 threads: 5.61 cpb 5.61 Mcycles Argon2d 3 iterations 1 MiB 1 threads: 5.18 cpb 5.18 Mcycles Argon2id 3 iterations 1 MiB 1 threads: 4.64 cpb 4.64 Mcycles 0.0000 seconds Argon2i 3 iterations 1 MiB 2 threads: 3.64 cpb 3.64 Mcycles Argon2d 3 iterations 1 MiB 2 threads: 3.26 cpb 3.26 Mcycles Argon2id 3 iterations 1 MiB 2 threads: 3.29 cpb 3.29 Mcycles 0.0000 seconds Argon2i 3 iterations 1 MiB 4 threads: 2.69 cpb 2.69 Mcycles Argon2d 3 iterations 1 MiB 4 threads: 2.69 cpb 2.69 Mcycles Argon2id 3 iterations 1 MiB 4 threads: 2.64 cpb 2.64 Mcycles 0.0000 seconds Argon2i 3 iterations 1 MiB 8 threads: 4.44 cpb 4.44 Mcycles Argon2d 3 iterations 1 MiB 8 threads: 4.41 cpb 4.41 Mcycles Argon2id 3 iterations 1 MiB 8 threads: 4.45 cpb 4.45 Mcycles 0.0000 seconds Argon2i 3 iterations 2 MiB 1 threads: 5.45 cpb 10.90 Mcycles Argon2d 3 iterations 2 MiB 1 threads: 5.19 cpb 10.39 Mcycles Argon2id 3 iterations 2 MiB 1 threads: 4.67 cpb 9.34 Mcycles 0.0000 seconds Argon2i 3 iterations 2 MiB 2 threads: 2.95 cpb 5.90 Mcycles Argon2d 3 iterations 2 MiB 2 threads: 2.88 cpb 5.75 Mcycles Argon2id 3 iterations 2 MiB 2 threads: 2.91 cpb 5.83 Mcycles 0.0000 seconds Argon2i 3 iterations 2 MiB 4 threads: 2.09 cpb 4.18 Mcycles Argon2d 3 iterations 2 MiB 4 threads: 2.09 cpb 4.17 Mcycles Argon2id 3 iterations 2 MiB 4 threads: 1.94 cpb 3.88 Mcycles 0.0000 seconds Argon2i 3 iterations 2 MiB 8 threads: 2.44 cpb 4.88 Mcycles Argon2d 3 iterations 2 MiB 8 threads: 2.48 cpb 4.96 Mcycles Argon2id 3 iterations 2 MiB 8 threads: 2.63 cpb 5.26 Mcycles 0.0000 seconds Argon2i 3 iterations 4 MiB 1 threads: 5.52 cpb 22.07 Mcycles Argon2d 3 iterations 4 MiB 1 threads: 5.01 cpb 20.06 Mcycles Argon2id 3 iterations 4 MiB 1 threads: 4.70 cpb 18.79 Mcycles 0.0000 seconds Argon2i 3 iterations 4 MiB 2 threads: 2.78 cpb 11.13 Mcycles Argon2d 3 iterations 4 MiB 2 threads: 2.69 cpb 10.76 Mcycles Argon2id 3 iterations 4 MiB 2 threads: 2.71 cpb 10.83 Mcycles 0.0000 seconds Argon2i 3 iterations 4 MiB 4 
threads: 1.68 cpb 6.73 Mcycles Argon2d 3 iterations 4 MiB 4 threads: 1.67 cpb 6.69 Mcycles Argon2id 3 iterations 4 MiB 4 threads: 1.68 cpb 6.74 Mcycles 0.0000 seconds Argon2i 3 iterations 4 MiB 8 threads: 2.24 cpb 8.98 Mcycles Argon2d 3 iterations 4 MiB 8 threads: 2.47 cpb 9.87 Mcycles Argon2id 3 iterations 4 MiB 8 threads: 1.94 cpb 7.76 Mcycles 0.0000 seconds Argon2i 3 iterations 8 MiB 1 threads: 5.71 cpb 45.69 Mcycles Argon2d 3 iterations 8 MiB 1 threads: 5.24 cpb 41.95 Mcycles Argon2id 3 iterations 8 MiB 1 threads: 4.87 cpb 38.96 Mcycles 0.0000 seconds Argon2i 3 iterations 8 MiB 2 threads: 2.71 cpb 21.71 Mcycles Argon2d 3 iterations 8 MiB 2 threads: 2.68 cpb 21.48 Mcycles Argon2id 3 iterations 8 MiB 2 threads: 2.68 cpb 21.46 Mcycles 0.0000 seconds Argon2i 3 iterations 8 MiB 4 threads: 1.55 cpb 12.43 Mcycles Argon2d 3 iterations 8 MiB 4 threads: 1.54 cpb 12.31 Mcycles Argon2id 3 iterations 8 MiB 4 threads: 1.56 cpb 12.46 Mcycles 0.0000 seconds Argon2i 3 iterations 8 MiB 8 threads: 1.77 cpb 14.15 Mcycles Argon2d 3 iterations 8 MiB 8 threads: 1.72 cpb 13.77 Mcycles Argon2id 3 iterations 8 MiB 8 threads: 1.80 cpb 14.39 Mcycles 0.0000 seconds Argon2i 3 iterations 16 MiB 1 threads: 5.97 cpb 95.46 Mcycles Argon2d 3 iterations 16 MiB 1 threads: 5.52 cpb 88.28 Mcycles Argon2id 3 iterations 16 MiB 1 threads: 5.21 cpb 83.43 Mcycles 0.0000 seconds Argon2i 3 iterations 16 MiB 2 threads: 2.87 cpb 45.92 Mcycles Argon2d 3 iterations 16 MiB 2 threads: 2.83 cpb 45.30 Mcycles Argon2id 3 iterations 16 MiB 2 threads: 2.84 cpb 45.51 Mcycles 0.0000 seconds Argon2i 3 iterations 16 MiB 4 threads: 1.59 cpb 25.43 Mcycles Argon2d 3 iterations 16 MiB 4 threads: 1.57 cpb 25.17 Mcycles Argon2id 3 iterations 16 MiB 4 threads: 1.58 cpb 25.32 Mcycles 0.0000 seconds Argon2i 3 iterations 16 MiB 8 threads: 1.92 cpb 30.72 Mcycles Argon2d 3 iterations 16 MiB 8 threads: 1.71 cpb 27.37 Mcycles Argon2id 3 iterations 16 MiB 8 threads: 1.78 cpb 28.47 Mcycles 0.0000 seconds Argon2i 3 iterations 32 MiB 1 
threads: 6.19 cpb 198.09 Mcycles Argon2d 3 iterations 32 MiB 1 threads: 6.10 cpb 195.33 Mcycles Argon2id 3 iterations 32 MiB 1 threads: 6.11 cpb 195.65 Mcycles 0.0000 seconds Argon2i 3 iterations 32 MiB 2 threads: 3.39 cpb 108.50 Mcycles Argon2d 3 iterations 32 MiB 2 threads: 3.36 cpb 107.50 Mcycles Argon2id 3 iterations 32 MiB 2 threads: 3.36 cpb 107.38 Mcycles 0.0000 seconds Argon2i 3 iterations 32 MiB 4 threads: 1.91 cpb 61.22 Mcycles Argon2d 3 iterations 32 MiB 4 threads: 1.90 cpb 60.79 Mcycles Argon2id 3 iterations 32 MiB 4 threads: 1.90 cpb 60.86 Mcycles 0.0000 seconds Argon2i 3 iterations 32 MiB 8 threads: 1.90 cpb 60.93 Mcycles Argon2d 3 iterations 32 MiB 8 threads: 1.90 cpb 60.83 Mcycles Argon2id 3 iterations 32 MiB 8 threads: 1.97 cpb 62.99 Mcycles 0.0000 seconds Argon2i 3 iterations 64 MiB 1 threads: 6.32 cpb 404.43 Mcycles Argon2d 3 iterations 64 MiB 1 threads: 6.23 cpb 398.94 Mcycles Argon2id 3 iterations 64 MiB 1 threads: 6.24 cpb 399.53 Mcycles 0.0000 seconds Argon2i 3 iterations 64 MiB 2 threads: 3.45 cpb 220.50 Mcycles Argon2d 3 iterations 64 MiB 2 threads: 3.41 cpb 218.07 Mcycles Argon2id 3 iterations 64 MiB 2 threads: 3.42 cpb 218.96 Mcycles 0.0000 seconds Argon2i 3 iterations 64 MiB 4 threads: 1.92 cpb 123.16 Mcycles Argon2d 3 iterations 64 MiB 4 threads: 1.91 cpb 122.17 Mcycles Argon2id 3 iterations 64 MiB 4 threads: 1.91 cpb 122.42 Mcycles 0.0000 seconds Argon2i 3 iterations 64 MiB 8 threads: 1.82 cpb 116.25 Mcycles Argon2d 3 iterations 64 MiB 8 threads: 1.84 cpb 117.60 Mcycles Argon2id 3 iterations 64 MiB 8 threads: 1.87 cpb 119.54 Mcycles 0.0000 seconds
The result now shows positive numbers for Mcycles, but I accidentally removed the equation that calculates the time at the end, so every run reports 0.0000 seconds. I will fix that now.
Here is the changed code:
/*
* Argon2 reference source code package - reference C implementations
*
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
* - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
* - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
*
* You should have received a copy of both of these licenses along with this
* software. If not, they may be obtained at the above URLs.
*/
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
/* Nanoseconds per second. Defined without a trailing semicolon so that the
 * macro can be used in the middle of an expression, not only at the end of a
 * statement. */
#define BILLION 1000000000L
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include "argon2.h"
/*
static uint64_t rdtsc(void) {
#ifdef _MSC_VER
return __rdtsc();
#else
#if defined(__amd64__) || defined(__x86_64__)
uint64_t rax, rdx;
__asm__ __volatile__("rdtsc" : "=a"(rax), "=d"(rdx) : :);
return (rdx << 32) | rax;
#elif defined(__i386__) || defined(__i386) || defined(__X86__)
uint64_t rax;
__asm__ __volatile__("rdtsc" : "=A"(rax) : :);
return rax;
#elif defined(__aarch64__)
return 1;
#else
return 0;
#endif
#endif
}
*/
/*
* Benchmarks Argon2 with salt length 16, password length 16, t_cost 3,
and different m_cost and threads
*/
static void benchmark() {
#define BENCH_OUTLEN 16
#define BENCH_INLEN 16
const uint32_t inlen = BENCH_INLEN;
const unsigned outlen = BENCH_OUTLEN;
unsigned char out[BENCH_OUTLEN];
unsigned char pwd_array[BENCH_INLEN];
unsigned char salt_array[BENCH_INLEN];
#undef BENCH_INLEN
#undef BENCH_OUTLEN
struct timespec start, stop;
double accum;
uint32_t t_cost = 3;
uint32_t m_cost;
uint32_t thread_test[4] = {1, 2, 4, 8};
argon2_type types[3] = {Argon2_i, Argon2_d, Argon2_id};
memset(pwd_array, 0, inlen);
memset(salt_array, 1, inlen);
for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2) {
unsigned i;
for (i = 0; i < 4; ++i) {
double run_time = 0;
uint32_t thread_n = thread_test[i];
unsigned j;
for (j = 0; j < 3; ++j) {
/*clock_t start_time, stop_time;
uint64_t start_cycles, stop_cycles;
uint64_t delta;
double mcycles;*/
argon2_type type = types[j];
/*start_time = clock();
start_cycles = rdtsc();*/
if( clock_gettime( CLOCK_REALTIME, &start) == -1 ) {
perror( "clock gettime" );
exit( EXIT_FAILURE );
}
else
{
clock_gettime(CLOCK_REALTIME, &start);
}
argon2_hash(t_cost, m_cost, thread_n, pwd_array, inlen,
salt_array, inlen, out, outlen, NULL, 0, type,
ARGON2_VERSION_NUMBER);
/*stop_cycles = rdtsc();
stop_time = clock();*/
/*delta = (stop_cycles - start_cycles) / (m_cost);
mcycles = (double)(stop_cycles - start_cycles) / (1UL << 20);
run_time += ((double)stop_time - start_time) / (CLOCKS_PER_SEC);*/
if( clock_gettime( CLOCK_REALTIME, &stop) == -1 ) {
perror( "clock gettime" );
exit( EXIT_FAILURE );
}
else
{
clock_gettime(CLOCK_REALTIME, &stop);
}
accum = ( (double)stop.tv_sec - (double)start.tv_sec )
+ ( (double)stop.tv_nsec - (double)start.tv_nsec ) / BILLION;
double mcycles = accum * BILLION;
mcycles = mcycles / (1UL << 20);
uint64_t delta = accum * BILLION;
delta = delta / (m_cost);
printf("%s %d iterations %d MiB %d threads: %2.2f cpb %2.2f "
"Mcycles \n", argon2_type2string(type, 1), t_cost,
m_cost >> 10, thread_n, (float)delta / 1024, mcycles);
run_time += run_time / (CLOCKS_PER_SEC);
/*run_time += accum;
printf("%2.4f seconds\n\n", (double)run_time);*/
}
/*run_time = 0;*/
run_time += accum;
printf("%2.4f seconds\n\n", run_time);
}
}
}
/*
 * Program entry point.
 *
 * Runs the benchmark sweep once and then returns ARGON2_OK as the
 * process exit status. The result of the benchmark itself is not
 * inspected here: benchmark() returns nothing to this caller.
 *
 * Declared as main(void) so the definition is a proper prototype
 * rather than an old-style declarator with unspecified parameters.
 */
int main(void) {
    benchmark();
    return ARGON2_OK;
}
Hopefully it works now.
Rebuild and test.
Result:
Argon2i 3 iterations 1 MiB 1 threads: 5.24 cpb 5.24 Mcycles Argon2d 3 iterations 1 MiB 1 threads: 4.89 cpb 4.90 Mcycles Argon2id 3 iterations 1 MiB 1 threads: 4.40 cpb 4.40 Mcycles 0.0046 seconds Argon2i 3 iterations 1 MiB 2 threads: 3.46 cpb 3.46 Mcycles Argon2d 3 iterations 1 MiB 2 threads: 3.13 cpb 3.13 Mcycles Argon2id 3 iterations 1 MiB 2 threads: 3.16 cpb 3.16 Mcycles 0.0033 seconds Argon2i 3 iterations 1 MiB 4 threads: 2.65 cpb 2.65 Mcycles Argon2d 3 iterations 1 MiB 4 threads: 2.58 cpb 2.58 Mcycles Argon2id 3 iterations 1 MiB 4 threads: 2.61 cpb 2.61 Mcycles 0.0027 seconds Argon2i 3 iterations 1 MiB 8 threads: 4.36 cpb 4.36 Mcycles Argon2d 3 iterations 1 MiB 8 threads: 4.27 cpb 4.27 Mcycles Argon2id 3 iterations 1 MiB 8 threads: 4.25 cpb 4.25 Mcycles 0.0045 seconds Argon2i 3 iterations 2 MiB 1 threads: 5.20 cpb 10.41 Mcycles Argon2d 3 iterations 2 MiB 1 threads: 4.93 cpb 9.86 Mcycles Argon2id 3 iterations 2 MiB 1 threads: 4.41 cpb 8.82 Mcycles 0.0092 seconds Argon2i 3 iterations 2 MiB 2 threads: 2.83 cpb 5.65 Mcycles Argon2d 3 iterations 2 MiB 2 threads: 2.72 cpb 5.44 Mcycles Argon2id 3 iterations 2 MiB 2 threads: 2.73 cpb 5.47 Mcycles 0.0057 seconds Argon2i 3 iterations 2 MiB 4 threads: 1.87 cpb 3.73 Mcycles Argon2d 3 iterations 2 MiB 4 threads: 1.99 cpb 3.98 Mcycles Argon2id 3 iterations 2 MiB 4 threads: 1.87 cpb 3.74 Mcycles 0.0039 seconds Argon2i 3 iterations 2 MiB 8 threads: 2.46 cpb 4.93 Mcycles Argon2d 3 iterations 2 MiB 8 threads: 2.52 cpb 5.05 Mcycles Argon2id 3 iterations 2 MiB 8 threads: 2.55 cpb 5.10 Mcycles 0.0053 seconds Argon2i 3 iterations 4 MiB 1 threads: 5.28 cpb 21.11 Mcycles Argon2d 3 iterations 4 MiB 1 threads: 4.80 cpb 19.21 Mcycles Argon2id 3 iterations 4 MiB 1 threads: 4.56 cpb 18.22 Mcycles 0.0191 seconds Argon2i 3 iterations 4 MiB 2 threads: 2.67 cpb 10.66 Mcycles Argon2d 3 iterations 4 MiB 2 threads: 2.56 cpb 10.25 Mcycles Argon2id 3 iterations 4 MiB 2 threads: 2.57 cpb 10.27 Mcycles 0.0108 seconds Argon2i 3 iterations 4 MiB 4 
threads: 1.61 cpb 6.42 Mcycles Argon2d 3 iterations 4 MiB 4 threads: 1.57 cpb 6.29 Mcycles Argon2id 3 iterations 4 MiB 4 threads: 2.26 cpb 9.03 Mcycles 0.0095 seconds Argon2i 3 iterations 4 MiB 8 threads: 2.43 cpb 9.74 Mcycles Argon2d 3 iterations 4 MiB 8 threads: 1.99 cpb 7.95 Mcycles Argon2id 3 iterations 4 MiB 8 threads: 2.15 cpb 8.61 Mcycles 0.0090 seconds Argon2i 3 iterations 8 MiB 1 threads: 5.50 cpb 43.97 Mcycles Argon2d 3 iterations 8 MiB 1 threads: 5.06 cpb 40.49 Mcycles Argon2id 3 iterations 8 MiB 1 threads: 4.63 cpb 37.06 Mcycles 0.0389 seconds Argon2i 3 iterations 8 MiB 2 threads: 2.62 cpb 20.97 Mcycles Argon2d 3 iterations 8 MiB 2 threads: 2.56 cpb 20.48 Mcycles Argon2id 3 iterations 8 MiB 2 threads: 2.57 cpb 20.53 Mcycles 0.0215 seconds Argon2i 3 iterations 8 MiB 4 threads: 1.49 cpb 11.91 Mcycles Argon2d 3 iterations 8 MiB 4 threads: 1.46 cpb 11.69 Mcycles Argon2id 3 iterations 8 MiB 4 threads: 1.47 cpb 11.74 Mcycles 0.0123 seconds Argon2i 3 iterations 8 MiB 8 threads: 1.96 cpb 15.66 Mcycles Argon2d 3 iterations 8 MiB 8 threads: 1.73 cpb 13.82 Mcycles Argon2id 3 iterations 8 MiB 8 threads: 1.86 cpb 14.86 Mcycles 0.0156 seconds Argon2i 3 iterations 16 MiB 1 threads: 5.75 cpb 92.08 Mcycles Argon2d 3 iterations 16 MiB 1 threads: 5.29 cpb 84.71 Mcycles Argon2id 3 iterations 16 MiB 1 threads: 5.01 cpb 80.20 Mcycles 0.0841 seconds Argon2i 3 iterations 16 MiB 2 threads: 2.75 cpb 44.01 Mcycles Argon2d 3 iterations 16 MiB 2 threads: 2.73 cpb 43.66 Mcycles Argon2id 3 iterations 16 MiB 2 threads: 2.72 cpb 43.55 Mcycles 0.0457 seconds Argon2i 3 iterations 16 MiB 4 threads: 1.52 cpb 24.39 Mcycles Argon2d 3 iterations 16 MiB 4 threads: 1.50 cpb 24.08 Mcycles Argon2id 3 iterations 16 MiB 4 threads: 1.51 cpb 24.14 Mcycles 0.0253 seconds Argon2i 3 iterations 16 MiB 8 threads: 1.70 cpb 27.21 Mcycles Argon2d 3 iterations 16 MiB 8 threads: 1.67 cpb 26.80 Mcycles Argon2id 3 iterations 16 MiB 8 threads: 1.70 cpb 27.21 Mcycles 0.0285 seconds Argon2i 3 iterations 32 MiB 1 
threads: 5.93 cpb 189.81 Mcycles Argon2d 3 iterations 32 MiB 1 threads: 5.88 cpb 188.10 Mcycles Argon2id 3 iterations 32 MiB 1 threads: 5.86 cpb 187.57 Mcycles 0.1967 seconds Argon2i 3 iterations 32 MiB 2 threads: 3.29 cpb 105.13 Mcycles Argon2d 3 iterations 32 MiB 2 threads: 3.25 cpb 103.96 Mcycles Argon2id 3 iterations 32 MiB 2 threads: 3.25 cpb 104.06 Mcycles 0.1091 seconds Argon2i 3 iterations 32 MiB 4 threads: 1.85 cpb 59.28 Mcycles Argon2d 3 iterations 32 MiB 4 threads: 1.84 cpb 58.83 Mcycles Argon2id 3 iterations 32 MiB 4 threads: 1.84 cpb 58.88 Mcycles 0.0617 seconds Argon2i 3 iterations 32 MiB 8 threads: 1.82 cpb 58.35 Mcycles Argon2d 3 iterations 32 MiB 8 threads: 1.99 cpb 63.75 Mcycles Argon2id 3 iterations 32 MiB 8 threads: 1.88 cpb 60.21 Mcycles 0.0631 seconds Argon2i 3 iterations 64 MiB 1 threads: 6.07 cpb 388.65 Mcycles Argon2d 3 iterations 64 MiB 1 threads: 6.01 cpb 384.52 Mcycles Argon2id 3 iterations 64 MiB 1 threads: 6.02 cpb 385.18 Mcycles 0.4039 seconds Argon2i 3 iterations 64 MiB 2 threads: 3.34 cpb 213.63 Mcycles Argon2d 3 iterations 64 MiB 2 threads: 3.30 cpb 211.42 Mcycles Argon2id 3 iterations 64 MiB 2 threads: 3.30 cpb 211.20 Mcycles 0.2215 seconds Argon2i 3 iterations 64 MiB 4 threads: 1.87 cpb 119.59 Mcycles Argon2d 3 iterations 64 MiB 4 threads: 1.84 cpb 118.12 Mcycles Argon2id 3 iterations 64 MiB 4 threads: 1.85 cpb 118.15 Mcycles 0.1239 seconds Argon2i 3 iterations 64 MiB 8 threads: 1.74 cpb 111.63 Mcycles Argon2d 3 iterations 64 MiB 8 threads: 1.76 cpb 112.49 Mcycles Argon2id 3 iterations 64 MiB 8 threads: 1.85 cpb 118.57 Mcycles 0.1243 seconds Argon2i 3 iterations 128 MiB 1 threads: 6.20 cpb 793.29 Mcycles Argon2d 3 iterations 128 MiB 1 threads: 6.14 cpb 785.44 Mcycles Argon2id 3 iterations 128 MiB 1 threads: 6.14 cpb 786.33 Mcycles 0.8245 seconds Argon2i 3 iterations 128 MiB 2 threads: 3.38 cpb 432.51 Mcycles Argon2d 3 iterations 128 MiB 2 threads: 3.35 cpb 428.33 Mcycles Argon2id 3 iterations 128 MiB 2 threads: 3.35 cpb 428.92 
Mcycles 0.4498 seconds Argon2i 3 iterations 128 MiB 4 threads: 1.88 cpb 240.65 Mcycles Argon2d 3 iterations 128 MiB 4 threads: 1.86 cpb 238.37 Mcycles Argon2id 3 iterations 128 MiB 4 threads: 1.86 cpb 238.47 Mcycles 0.2501 seconds Argon2i 3 iterations 128 MiB 8 threads: 1.60 cpb 205.20 Mcycles Argon2d 3 iterations 128 MiB 8 threads: 1.71 cpb 218.40 Mcycles Argon2id 3 iterations 128 MiB 8 threads: 1.77 cpb 227.16 Mcycles 0.2382 seconds Argon2i 3 iterations 256 MiB 1 threads: 6.30 cpb 1611.99 Mcycles Argon2d 3 iterations 256 MiB 1 threads: 6.24 cpb 1597.32 Mcycles Argon2id 3 iterations 256 MiB 1 threads: 6.25 cpb 1600.12 Mcycles 1.6778 seconds Argon2i 3 iterations 256 MiB 2 threads: 3.42 cpb 874.77 Mcycles Argon2d 3 iterations 256 MiB 2 threads: 3.39 cpb 867.53 Mcycles Argon2id 3 iterations 256 MiB 2 threads: 3.39 cpb 868.38 Mcycles 0.9106 seconds Argon2i 3 iterations 256 MiB 4 threads: 1.92 cpb 491.15 Mcycles Argon2d 3 iterations 256 MiB 4 threads: 1.88 cpb 481.03 Mcycles Argon2id 3 iterations 256 MiB 4 threads: 1.89 cpb 484.98 Mcycles 0.5085 seconds Argon2i 3 iterations 256 MiB 8 threads: 1.44 cpb 369.10 Mcycles Argon2d 3 iterations 256 MiB 8 threads: 1.63 cpb 418.42 Mcycles Argon2id 3 iterations 256 MiB 8 threads: 1.67 cpb 428.07 Mcycles 0.4489 seconds
The run appears to be successful. I will try again, this time with the optimization level set to -O3 in the GCC compiler flags.
I can change this flag by editing the Makefile in the Vim editor.
command:
vi Makefile
I will change the following line:
CFLAGS += -std=c89 -O2 -Wall -g -Iinclude -Isrc
The change will look like this:
CFLAGS += -std=c89 -O3 -Wall -g -Iinclude -Isrc
I will save the file with the new changes and rebuild the program to test it.
command:
make bench
Result:
Argon2i 3 iterations 1 MiB 1 threads: 4.80 cpb 4.80 Mcycles Argon2d 3 iterations 1 MiB 1 threads: 4.52 cpb 4.52 Mcycles Argon2id 3 iterations 1 MiB 1 threads: 3.96 cpb 3.96 Mcycles 0.0042 seconds Argon2i 3 iterations 1 MiB 2 threads: 3.33 cpb 3.33 Mcycles Argon2d 3 iterations 1 MiB 2 threads: 2.92 cpb 2.92 Mcycles Argon2id 3 iterations 1 MiB 2 threads: 2.91 cpb 2.91 Mcycles 0.0031 seconds Argon2i 3 iterations 1 MiB 4 threads: 2.46 cpb 2.46 Mcycles Argon2d 3 iterations 1 MiB 4 threads: 2.43 cpb 2.43 Mcycles Argon2id 3 iterations 1 MiB 4 threads: 2.48 cpb 2.48 Mcycles 0.0026 seconds Argon2i 3 iterations 1 MiB 8 threads: 4.52 cpb 4.52 Mcycles Argon2d 3 iterations 1 MiB 8 threads: 4.39 cpb 4.39 Mcycles Argon2id 3 iterations 1 MiB 8 threads: 4.33 cpb 4.33 Mcycles 0.0045 seconds Argon2i 3 iterations 2 MiB 1 threads: 4.79 cpb 9.57 Mcycles Argon2d 3 iterations 2 MiB 1 threads: 4.52 cpb 9.04 Mcycles Argon2id 3 iterations 2 MiB 1 threads: 4.00 cpb 8.00 Mcycles 0.0084 seconds Argon2i 3 iterations 2 MiB 2 threads: 2.62 cpb 5.25 Mcycles Argon2d 3 iterations 2 MiB 2 threads: 2.58 cpb 5.17 Mcycles Argon2id 3 iterations 2 MiB 2 threads: 2.59 cpb 5.18 Mcycles 0.0054 seconds Argon2i 3 iterations 2 MiB 4 threads: 1.85 cpb 3.69 Mcycles Argon2d 3 iterations 2 MiB 4 threads: 1.85 cpb 3.70 Mcycles Argon2id 3 iterations 2 MiB 4 threads: 1.77 cpb 3.53 Mcycles 0.0037 seconds Argon2i 3 iterations 2 MiB 8 threads: 2.31 cpb 4.62 Mcycles Argon2d 3 iterations 2 MiB 8 threads: 2.42 cpb 4.84 Mcycles Argon2id 3 iterations 2 MiB 8 threads: 2.46 cpb 4.93 Mcycles 0.0052 seconds Argon2i 3 iterations 4 MiB 1 threads: 4.87 cpb 19.47 Mcycles Argon2d 3 iterations 4 MiB 1 threads: 4.39 cpb 17.55 Mcycles Argon2id 3 iterations 4 MiB 1 threads: 4.03 cpb 16.11 Mcycles 0.0169 seconds Argon2i 3 iterations 4 MiB 2 threads: 2.45 cpb 9.81 Mcycles Argon2d 3 iterations 4 MiB 2 threads: 2.40 cpb 9.61 Mcycles Argon2id 3 iterations 4 MiB 2 threads: 2.39 cpb 9.56 Mcycles 0.0100 seconds Argon2i 3 iterations 4 MiB 4 
threads: 1.48 cpb 5.93 Mcycles Argon2d 3 iterations 4 MiB 4 threads: 1.47 cpb 5.87 Mcycles Argon2id 3 iterations 4 MiB 4 threads: 1.50 cpb 5.98 Mcycles 0.0063 seconds Argon2i 3 iterations 4 MiB 8 threads: 2.21 cpb 8.84 Mcycles Argon2d 3 iterations 4 MiB 8 threads: 2.05 cpb 8.19 Mcycles Argon2id 3 iterations 4 MiB 8 threads: 2.13 cpb 8.53 Mcycles 0.0089 seconds Argon2i 3 iterations 8 MiB 1 threads: 5.14 cpb 41.16 Mcycles Argon2d 3 iterations 8 MiB 1 threads: 4.62 cpb 36.95 Mcycles Argon2id 3 iterations 8 MiB 1 threads: 4.23 cpb 33.81 Mcycles 0.0355 seconds Argon2i 3 iterations 8 MiB 2 threads: 2.42 cpb 19.33 Mcycles Argon2d 3 iterations 8 MiB 2 threads: 2.38 cpb 19.03 Mcycles Argon2id 3 iterations 8 MiB 2 threads: 2.38 cpb 19.03 Mcycles 0.0200 seconds Argon2i 3 iterations 8 MiB 4 threads: 1.38 cpb 11.09 Mcycles Argon2d 3 iterations 8 MiB 4 threads: 1.38 cpb 11.00 Mcycles Argon2id 3 iterations 8 MiB 4 threads: 1.38 cpb 11.07 Mcycles 0.0116 seconds Argon2i 3 iterations 8 MiB 8 threads: 1.73 cpb 13.88 Mcycles Argon2d 3 iterations 8 MiB 8 threads: 1.81 cpb 14.47 Mcycles Argon2id 3 iterations 8 MiB 8 threads: 1.90 cpb 15.24 Mcycles 0.0160 seconds Argon2i 3 iterations 16 MiB 1 threads: 5.39 cpb 86.31 Mcycles Argon2d 3 iterations 16 MiB 1 threads: 4.93 cpb 78.84 Mcycles Argon2id 3 iterations 16 MiB 1 threads: 4.66 cpb 74.55 Mcycles 0.0782 seconds Argon2i 3 iterations 16 MiB 2 threads: 2.59 cpb 41.41 Mcycles Argon2d 3 iterations 16 MiB 2 threads: 2.56 cpb 40.95 Mcycles Argon2id 3 iterations 16 MiB 2 threads: 2.57 cpb 41.09 Mcycles 0.0431 seconds Argon2i 3 iterations 16 MiB 4 threads: 1.47 cpb 23.47 Mcycles Argon2d 3 iterations 16 MiB 4 threads: 1.46 cpb 23.35 Mcycles Argon2id 3 iterations 16 MiB 4 threads: 1.44 cpb 23.05 Mcycles 0.0242 seconds Argon2i 3 iterations 16 MiB 8 threads: 1.69 cpb 27.07 Mcycles Argon2d 3 iterations 16 MiB 8 threads: 1.71 cpb 27.36 Mcycles Argon2id 3 iterations 16 MiB 8 threads: 1.60 cpb 25.60 Mcycles 0.0268 seconds Argon2i 3 iterations 32 MiB 1 
threads: 5.56 cpb 178.05 Mcycles Argon2d 3 iterations 32 MiB 1 threads: 5.48 cpb 175.31 Mcycles Argon2id 3 iterations 32 MiB 1 threads: 5.49 cpb 175.62 Mcycles 0.1841 seconds Argon2i 3 iterations 32 MiB 2 threads: 3.10 cpb 99.33 Mcycles Argon2d 3 iterations 32 MiB 2 threads: 3.07 cpb 98.24 Mcycles Argon2id 3 iterations 32 MiB 2 threads: 3.07 cpb 98.39 Mcycles 0.1032 seconds Argon2i 3 iterations 32 MiB 4 threads: 1.78 cpb 56.83 Mcycles Argon2d 3 iterations 32 MiB 4 threads: 1.76 cpb 56.34 Mcycles Argon2id 3 iterations 32 MiB 4 threads: 1.76 cpb 56.46 Mcycles 0.0592 seconds Argon2i 3 iterations 32 MiB 8 threads: 1.80 cpb 57.72 Mcycles Argon2d 3 iterations 32 MiB 8 threads: 1.75 cpb 56.17 Mcycles Argon2id 3 iterations 32 MiB 8 threads: 1.80 cpb 57.75 Mcycles 0.0606 seconds Argon2i 3 iterations 64 MiB 1 threads: 5.69 cpb 364.37 Mcycles Argon2d 3 iterations 64 MiB 1 threads: 5.63 cpb 360.52 Mcycles Argon2id 3 iterations 64 MiB 1 threads: 5.64 cpb 361.19 Mcycles 0.3787 seconds Argon2i 3 iterations 64 MiB 2 threads: 3.17 cpb 203.00 Mcycles Argon2d 3 iterations 64 MiB 2 threads: 3.14 cpb 200.72 Mcycles Argon2id 3 iterations 64 MiB 2 threads: 3.14 cpb 201.11 Mcycles 0.2109 seconds Argon2i 3 iterations 64 MiB 4 threads: 1.79 cpb 114.35 Mcycles Argon2d 3 iterations 64 MiB 4 threads: 1.77 cpb 113.36 Mcycles Argon2id 3 iterations 64 MiB 4 threads: 1.78 cpb 114.01 Mcycles 0.1195 seconds Argon2i 3 iterations 64 MiB 8 threads: 1.69 cpb 108.44 Mcycles Argon2d 3 iterations 64 MiB 8 threads: 1.72 cpb 109.93 Mcycles Argon2id 3 iterations 64 MiB 8 threads: 1.70 cpb 108.90 Mcycles 0.1142 seconds Argon2i 3 iterations 128 MiB 1 threads: 5.81 cpb 743.61 Mcycles Argon2d 3 iterations 128 MiB 1 threads: 5.76 cpb 737.17 Mcycles Argon2id 3 iterations 128 MiB 1 threads: 5.76 cpb 737.74 Mcycles 0.7736 seconds Argon2i 3 iterations 128 MiB 2 threads: 3.23 cpb 413.39 Mcycles Argon2d 3 iterations 128 MiB 2 threads: 3.20 cpb 409.93 Mcycles Argon2id 3 iterations 128 MiB 2 threads: 3.20 cpb 410.16 
Mcycles 0.4301 seconds Argon2i 3 iterations 128 MiB 4 threads: 1.80 cpb 230.53 Mcycles Argon2d 3 iterations 128 MiB 4 threads: 1.79 cpb 228.66 Mcycles Argon2id 3 iterations 128 MiB 4 threads: 1.78 cpb 228.44 Mcycles 0.2395 seconds Argon2i 3 iterations 128 MiB 8 threads: 1.69 cpb 216.05 Mcycles Argon2d 3 iterations 128 MiB 8 threads: 1.62 cpb 207.76 Mcycles Argon2id 3 iterations 128 MiB 8 threads: 1.65 cpb 211.43 Mcycles 0.2217 seconds Argon2i 3 iterations 256 MiB 1 threads: 5.93 cpb 1517.87 Mcycles Argon2d 3 iterations 256 MiB 1 threads: 5.87 cpb 1503.31 Mcycles Argon2id 3 iterations 256 MiB 1 threads: 5.88 cpb 1505.68 Mcycles 1.5788 seconds Argon2i 3 iterations 256 MiB 2 threads: 3.27 cpb 838.35 Mcycles Argon2d 3 iterations 256 MiB 2 threads: 3.25 cpb 831.07 Mcycles Argon2id 3 iterations 256 MiB 2 threads: 3.25 cpb 831.79 Mcycles 0.8722 seconds Argon2i 3 iterations 256 MiB 4 threads: 1.81 cpb 464.17 Mcycles Argon2d 3 iterations 256 MiB 4 threads: 1.81 cpb 463.87 Mcycles Argon2id 3 iterations 256 MiB 4 threads: 1.80 cpb 461.07 Mcycles 0.4835 seconds Argon2i 3 iterations 256 MiB 8 threads: 1.53 cpb 390.76 Mcycles Argon2d 3 iterations 256 MiB 8 threads: 1.59 cpb 406.13 Mcycles Argon2id 3 iterations 256 MiB 8 threads: 1.60 cpb 409.85 Mcycles 0.4298 seconds
The results seem quite similar to those obtained with optimization level -O2. This could be due to the additional writing of variables into memory.
Test 2(x86_64):
I will try the changed code on machine 2.
This machine, as mentioned before, has these specifications:
Machine 2:
Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.70GHz
Four sticks of 8GB DIMM DDR4 RAM at 2.4 GHz (32 GB of RAM in total)
x86_64 Fedora 28 version of Linux Operating System
I will run the test with optimization level -O2.
Compile the program:
cc -std=c89 -O2 -Wall -g -Iinclude -Isrc -pthread -march=native src/argon2.c src/core.c src/blake2/blake2b.c src/thread.c src/encoding.c src/opt.c src/bench.c -o bench
Result:
Argon2i 3 iterations 1 MiB 1 threads: 3.54 cpb 3.54 Mcycles Argon2d 3 iterations 1 MiB 1 threads: 3.20 cpb 3.20 Mcycles Argon2id 3 iterations 1 MiB 1 threads: 2.73 cpb 2.73 Mcycles 0.0029 seconds Argon2i 3 iterations 1 MiB 2 threads: 2.92 cpb 2.92 Mcycles Argon2d 3 iterations 1 MiB 2 threads: 2.34 cpb 2.34 Mcycles Argon2id 3 iterations 1 MiB 2 threads: 2.40 cpb 2.40 Mcycles 0.0025 seconds Argon2i 3 iterations 1 MiB 4 threads: 1.97 cpb 1.97 Mcycles Argon2d 3 iterations 1 MiB 4 threads: 1.87 cpb 1.87 Mcycles Argon2id 3 iterations 1 MiB 4 threads: 1.94 cpb 1.94 Mcycles 0.0020 seconds Argon2i 3 iterations 1 MiB 8 threads: 3.21 cpb 3.21 Mcycles Argon2d 3 iterations 1 MiB 8 threads: 3.00 cpb 3.00 Mcycles Argon2id 3 iterations 1 MiB 8 threads: 2.81 cpb 2.81 Mcycles 0.0030 seconds Argon2i 3 iterations 2 MiB 1 threads: 1.40 cpb 2.79 Mcycles Argon2d 3 iterations 2 MiB 1 threads: 1.21 cpb 2.42 Mcycles Argon2id 3 iterations 2 MiB 1 threads: 1.04 cpb 2.08 Mcycles 0.0022 seconds Argon2i 3 iterations 2 MiB 2 threads: 1.44 cpb 2.88 Mcycles Argon2d 3 iterations 2 MiB 2 threads: 1.36 cpb 2.72 Mcycles Argon2id 3 iterations 2 MiB 2 threads: 1.37 cpb 2.73 Mcycles 0.0029 seconds Argon2i 3 iterations 2 MiB 4 threads: 0.99 cpb 1.99 Mcycles Argon2d 3 iterations 2 MiB 4 threads: 1.11 cpb 2.21 Mcycles Argon2id 3 iterations 2 MiB 4 threads: 1.05 cpb 2.11 Mcycles 0.0022 seconds Argon2i 3 iterations 2 MiB 8 threads: 1.67 cpb 3.35 Mcycles Argon2d 3 iterations 2 MiB 8 threads: 1.54 cpb 3.08 Mcycles Argon2id 3 iterations 2 MiB 8 threads: 1.51 cpb 3.02 Mcycles 0.0032 seconds Argon2i 3 iterations 4 MiB 1 threads: 1.41 cpb 5.65 Mcycles Argon2d 3 iterations 4 MiB 1 threads: 1.09 cpb 4.38 Mcycles Argon2id 3 iterations 4 MiB 1 threads: 0.98 cpb 3.92 Mcycles 0.0041 seconds Argon2i 3 iterations 4 MiB 2 threads: 1.28 cpb 5.13 Mcycles Argon2d 3 iterations 4 MiB 2 threads: 1.21 cpb 4.85 Mcycles Argon2id 3 iterations 4 MiB 2 threads: 1.23 cpb 4.93 Mcycles 0.0052 seconds Argon2i 3 iterations 4 MiB 4 threads: 
0.79 cpb 3.18 Mcycles Argon2d 3 iterations 4 MiB 4 threads: 0.79 cpb 3.18 Mcycles Argon2id 3 iterations 4 MiB 4 threads: 0.81 cpb 3.22 Mcycles 0.0034 seconds Argon2i 3 iterations 4 MiB 8 threads: 1.00 cpb 4.00 Mcycles Argon2d 3 iterations 4 MiB 8 threads: 0.89 cpb 3.58 Mcycles Argon2id 3 iterations 4 MiB 8 threads: 0.91 cpb 3.64 Mcycles 0.0038 seconds Argon2i 3 iterations 8 MiB 1 threads: 1.47 cpb 11.79 Mcycles Argon2d 3 iterations 8 MiB 1 threads: 1.13 cpb 9.08 Mcycles Argon2id 3 iterations 8 MiB 1 threads: 0.97 cpb 7.80 Mcycles 0.0082 seconds Argon2i 3 iterations 8 MiB 2 threads: 1.27 cpb 10.18 Mcycles Argon2d 3 iterations 8 MiB 2 threads: 0.87 cpb 6.95 Mcycles Argon2id 3 iterations 8 MiB 2 threads: 0.88 cpb 7.00 Mcycles 0.0073 seconds Argon2i 3 iterations 8 MiB 4 threads: 0.91 cpb 7.31 Mcycles Argon2d 3 iterations 8 MiB 4 threads: 0.80 cpb 6.42 Mcycles Argon2id 3 iterations 8 MiB 4 threads: 0.59 cpb 4.70 Mcycles 0.0049 seconds Argon2i 3 iterations 8 MiB 8 threads: 0.82 cpb 6.53 Mcycles Argon2d 3 iterations 8 MiB 8 threads: 0.83 cpb 6.63 Mcycles Argon2id 3 iterations 8 MiB 8 threads: 0.81 cpb 6.47 Mcycles 0.0068 seconds Argon2i 3 iterations 16 MiB 1 threads: 1.89 cpb 30.20 Mcycles Argon2d 3 iterations 16 MiB 1 threads: 1.33 cpb 21.22 Mcycles Argon2id 3 iterations 16 MiB 1 threads: 1.17 cpb 18.70 Mcycles 0.0196 seconds Argon2i 3 iterations 16 MiB 2 threads: 1.17 cpb 18.80 Mcycles Argon2d 3 iterations 16 MiB 2 threads: 0.81 cpb 13.03 Mcycles Argon2id 3 iterations 16 MiB 2 threads: 0.79 cpb 12.57 Mcycles 0.0132 seconds Argon2i 3 iterations 16 MiB 4 threads: 0.80 cpb 12.79 Mcycles Argon2d 3 iterations 16 MiB 4 threads: 0.56 cpb 8.97 Mcycles Argon2id 3 iterations 16 MiB 4 threads: 0.53 cpb 8.45 Mcycles 0.0089 seconds Argon2i 3 iterations 16 MiB 8 threads: 0.60 cpb 9.57 Mcycles Argon2d 3 iterations 16 MiB 8 threads: 0.64 cpb 10.22 Mcycles Argon2id 3 iterations 16 MiB 8 threads: 0.68 cpb 10.83 Mcycles 0.0114 seconds Argon2i 3 iterations 32 MiB 1 threads: 1.64 cpb 52.53 
Mcycles Argon2d 3 iterations 32 MiB 1 threads: 1.50 cpb 47.89 Mcycles Argon2id 3 iterations 32 MiB 1 threads: 1.49 cpb 47.84 Mcycles 0.0502 seconds Argon2i 3 iterations 32 MiB 2 threads: 1.28 cpb 41.08 Mcycles Argon2d 3 iterations 32 MiB 2 threads: 1.29 cpb 41.17 Mcycles Argon2id 3 iterations 32 MiB 2 threads: 1.38 cpb 44.31 Mcycles 0.0465 seconds Argon2i 3 iterations 32 MiB 4 threads: 0.86 cpb 27.46 Mcycles Argon2d 3 iterations 32 MiB 4 threads: 0.74 cpb 23.58 Mcycles Argon2id 3 iterations 32 MiB 4 threads: 0.65 cpb 20.68 Mcycles 0.0217 seconds Argon2i 3 iterations 32 MiB 8 threads: 0.68 cpb 21.81 Mcycles Argon2d 3 iterations 32 MiB 8 threads: 0.69 cpb 22.09 Mcycles Argon2id 3 iterations 32 MiB 8 threads: 0.68 cpb 21.73 Mcycles 0.0228 seconds Argon2i 3 iterations 64 MiB 1 threads: 1.61 cpb 103.11 Mcycles Argon2d 3 iterations 64 MiB 1 threads: 1.58 cpb 101.05 Mcycles Argon2id 3 iterations 64 MiB 1 threads: 1.58 cpb 101.25 Mcycles 0.1062 seconds Argon2i 3 iterations 64 MiB 2 threads: 1.44 cpb 92.42 Mcycles Argon2d 3 iterations 64 MiB 2 threads: 1.18 cpb 75.76 Mcycles Argon2id 3 iterations 64 MiB 2 threads: 1.18 cpb 75.28 Mcycles 0.0789 seconds Argon2i 3 iterations 64 MiB 4 threads: 0.76 cpb 48.48 Mcycles Argon2d 3 iterations 64 MiB 4 threads: 0.65 cpb 41.49 Mcycles Argon2id 3 iterations 64 MiB 4 threads: 0.63 cpb 40.49 Mcycles 0.0425 seconds Argon2i 3 iterations 64 MiB 8 threads: 0.58 cpb 37.08 Mcycles Argon2d 3 iterations 64 MiB 8 threads: 0.61 cpb 38.88 Mcycles Argon2id 3 iterations 64 MiB 8 threads: 0.61 cpb 39.02 Mcycles 0.0409 seconds Argon2i 3 iterations 128 MiB 1 threads: 1.72 cpb 220.68 Mcycles Argon2d 3 iterations 128 MiB 1 threads: 1.65 cpb 211.20 Mcycles Argon2id 3 iterations 128 MiB 1 threads: 1.61 cpb 206.66 Mcycles 0.2167 seconds Argon2i 3 iterations 128 MiB 2 threads: 1.12 cpb 143.16 Mcycles Argon2d 3 iterations 128 MiB 2 threads: 1.11 cpb 142.53 Mcycles Argon2id 3 iterations 128 MiB 2 threads: 1.11 cpb 142.67 Mcycles 0.1496 seconds Argon2i 3 
iterations 128 MiB 4 threads: 0.68 cpb 87.52 Mcycles Argon2d 3 iterations 128 MiB 4 threads: 0.68 cpb 86.96 Mcycles Argon2id 3 iterations 128 MiB 4 threads: 0.68 cpb 86.78 Mcycles 0.0910 seconds Argon2i 3 iterations 128 MiB 8 threads: 0.59 cpb 75.56 Mcycles Argon2d 3 iterations 128 MiB 8 threads: 0.55 cpb 70.96 Mcycles Argon2id 3 iterations 128 MiB 8 threads: 0.58 cpb 74.02 Mcycles 0.0776 seconds Argon2i 3 iterations 256 MiB 1 threads: 1.75 cpb 447.73 Mcycles Argon2d 3 iterations 256 MiB 1 threads: 1.62 cpb 414.48 Mcycles Argon2id 3 iterations 256 MiB 1 threads: 1.62 cpb 415.25 Mcycles 0.4354 seconds Argon2i 3 iterations 256 MiB 2 threads: 1.17 cpb 299.72 Mcycles Argon2d 3 iterations 256 MiB 2 threads: 1.07 cpb 274.17 Mcycles Argon2id 3 iterations 256 MiB 2 threads: 1.14 cpb 291.48 Mcycles 0.3056 seconds Argon2i 3 iterations 256 MiB 4 threads: 0.70 cpb 180.25 Mcycles Argon2d 3 iterations 256 MiB 4 threads: 0.71 cpb 182.79 Mcycles Argon2id 3 iterations 256 MiB 4 threads: 0.70 cpb 180.23 Mcycles 0.1890 seconds Argon2i 3 iterations 256 MiB 8 threads: 0.54 cpb 137.75 Mcycles Argon2d 3 iterations 256 MiB 8 threads: 0.54 cpb 139.23 Mcycles Argon2id 3 iterations 256 MiB 8 threads: 0.53 cpb 134.82 Mcycles 0.1414 seconds
(This blog is getting too long. I will continue in Project: Part3 – Optimizing and porting argon2 package using C and Assembler language(Progress 4))