diff --git a/blis-Reimplemented-bli_cpuid_query-for-ARM.patch b/blis-Reimplemented-bli_cpuid_query-for-ARM.patch new file mode 100644 index 0000000..569c3ab --- /dev/null +++ b/blis-Reimplemented-bli_cpuid_query-for-ARM.patch @@ -0,0 +1,306 @@ +From 1cfe8e2562e5e50769468382626ce36b734741c1 Mon Sep 17 00:00:00 2001 +From: "Field G. Van Zee" +Date: Thu, 5 Sep 2019 16:08:30 -0500 +Subject: [PATCH] Reimplemented bli_cpuid_query() for ARM. + +Details: +- Rewrote bli_cpuid_query() for ARM architectures to use stdio-based + functions such as fopen() and fgets() instead of popen(). The new code + does more or less the same thing as before--searches /proc/cpuinfo for + various strings, which are then parsed in order to determine the + model, part number, and features. Thanks to Dave Love for suggesting + this change in issue #335. +--- + frame/base/bli_cpuid.c | 174 ++++++++++++++++++++++++++----------------------- + frame/base/bli_cpuid.h | 34 +++++----- + 2 files changed, 109 insertions(+), 99 deletions(-) + +diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c +index f5c53fc..c8891f0 100644 +--- a/frame/base/bli_cpuid.c ++++ b/frame/base/bli_cpuid.c +@@ -380,10 +380,12 @@ arch_t bli_cpuid_query_id( void ) + // vendor. + vendor = bli_cpuid_query( &model, &part, &features ); + +- //printf( "vendor = %u\n", vendor ); +- //printf( "model = %u\n", model ); +- //printf( "part = 0x%x\n", part ); +- //printf( "features = %u\n", features ); ++#if 0 ++ printf( "vendor = %u\n", vendor ); ++ printf( "model = %u\n", model ); ++ printf( "part = 0x%x\n", part ); ++ printf( "features = %u\n", features ); ++#endif + + if ( vendor == VENDOR_ARM ) + { +@@ -909,6 +911,8 @@ int vpu_count( void ) + + #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) + ++#define TEMP_BUFFER_SIZE 200 ++ + uint32_t bli_cpuid_query + ( + uint32_t* model, +@@ -919,96 +923,40 @@ uint32_t bli_cpuid_query + *model = MODEL_UNKNOWN; + *part = 0; + *features = 0; +- +-#if 1 +- const char* grep_str1 = "grep -m 1 Processor /proc/cpuinfo"; +- const char* grep_str2 = "grep -m 1 'CPU part' /proc/cpuinfo"; +- const char* grep_str3 = "grep -m 1 Features /proc/cpuinfo"; +-#else +- const char* grep_str1 = "grep -m 1 Processor ./proc_cpuinfo"; +- const char* grep_str2 = "grep -m 1 'CPU part' ./proc_cpuinfo"; +- const char* grep_str3 = "grep -m 1 Features ./proc_cpuinfo"; +-#endif + +- FILE *fd1 = popen( grep_str1, "r"); +- if ( !fd1 ) +- { +- //printf("popen 1 failed\n"); +- return VENDOR_ARM; +- } +- FILE *fd2 = popen( grep_str2, "r"); +- if (!fd2) +- { +- //printf("popen 2 failed\n"); +- pclose(fd1); +- return VENDOR_ARM; +- } +- FILE *fd3 = popen( grep_str3, "r"); +- if (!fd3) +- { +- //printf("popen 3 failed\n"); +- pclose(fd1); +- pclose(fd2); +- return VENDOR_ARM; +- } +- +- uint32_t n1, n2, n3; +- int c; +- +- // First, discover how many chars are in each stream. +- for ( n1 = 0; (c = fgetc(fd1)) != EOF; ++n1 ) continue; +- for ( n2 = 0; (c = fgetc(fd2)) != EOF; ++n2 ) continue; +- for ( n3 = 0; (c = fgetc(fd3)) != EOF; ++n3 ) continue; +- +- //printf( "n1, n2, n3 = %u %u %u\n", n1, n2, n3 ); +- +- // Close the streams. +- pclose( fd1 ); +- pclose( fd2 ); +- pclose( fd3 ); +- +- // Allocate the correct amount of memory for each stream. +- char* proc_str = malloc( ( size_t )( n1 + 1 ) ); +- char* ptno_str = malloc( ( size_t )( n2 + 1 ) ); +- char* feat_str = malloc( ( size_t )( n3 + 1 ) ); +- *proc_str = 0; +- *ptno_str = 0; +- *feat_str = 0; +- +- // Re-open the streams. Note that there is no need to check for errors +- // this time since we're assumign that the contents of /proc/cpuinfo +- // will be the same as before. +- fd1 = popen( grep_str1, "r"); +- fd2 = popen( grep_str2, "r"); +- fd3 = popen( grep_str3, "r"); ++ char* pci_str = "/proc/cpuinfo"; + ++ char proc_str[ TEMP_BUFFER_SIZE ]; ++ char ptno_str[ TEMP_BUFFER_SIZE ]; ++ char feat_str[ TEMP_BUFFER_SIZE ]; + char* r_val; + +- // Now read each stream in its entirety. Nothing should go wrong, but +- // if it does, bail out. +- r_val = fgets( proc_str, n1, fd1 ); +- if ( n1 && r_val == NULL ) bli_abort(); ++ //printf( "bli_cpuid_query(): beginning search\n" ); + +- r_val = fgets( ptno_str, n2, fd2 ); +- if ( n2 && r_val == NULL ) bli_abort(); ++ // Search /proc/cpuinfo for the 'Processor' entry. ++ r_val = find_string_in( "Processor", proc_str, TEMP_BUFFER_SIZE, pci_str ); ++ if ( r_val == NULL ) return VENDOR_ARM; + +- r_val = fgets( feat_str, n3, fd3 ); +- if ( n3 && r_val == NULL ) bli_abort(); ++ // Search /proc/cpuinfo for the 'CPU part' entry. ++ r_val = find_string_in( "CPU part", ptno_str, TEMP_BUFFER_SIZE, pci_str ); ++ if ( r_val == NULL ) return VENDOR_ARM; + +- //printf( "proc_str: %s\n", proc_str ); +- //printf( "ptno_str: %s\n", ptno_str ); +- //printf( "feat_str: %s\n", feat_str ); ++ // Search /proc/cpuinfo for the 'Features' entry. ++ r_val = find_string_in( "Features", feat_str, TEMP_BUFFER_SIZE, pci_str ); ++ if ( r_val == NULL ) return VENDOR_ARM; + +- // Close the streams. +- pclose( fd1 ); +- pclose( fd2 ); +- pclose( fd3 ); ++#if 0 ++ printf( "bli_cpuid_query(): full processor string: %s\n", proc_str ); ++ printf( "bli_cpuid_query(): full part num string: %s\n", ptno_str ); ++ printf( "bli_cpuid_query(): full features string: %s\n", feat_str ); ++#endif + + // Parse the feature string to check for SIMD features. + if ( strstr( feat_str, "neon" ) != NULL || + strstr( feat_str, "asimd" ) != NULL ) + *features |= FEATURE_NEON; +- //printf( "features var: %u\n", *features ); ++ ++ //printf( "bli_cpuid_query(): features var: %u\n", *features ); + + // Parse the processor string to uncover the model. + if ( strstr( proc_str, "ARMv7" ) != NULL ) +@@ -1016,7 +964,8 @@ uint32_t bli_cpuid_query + else if ( strstr( proc_str, "AArch64" ) != NULL || + strstr( proc_str, "ARMv8" ) ) + *model = MODEL_ARMV8; +- //printf( "model: %u\n", *model ); ++ ++ //printf( "bli_cpuid_query(): model: %u\n", *model ); + + // Parse the part number string. + r_val = strstr( ptno_str, "0x" ); +@@ -1024,9 +973,68 @@ uint32_t bli_cpuid_query + { + *part = strtol( r_val, NULL, 16 ); + } +- //printf( "part#: %x\n", *part ); ++ //printf( "bli_cpuid_query(): part#: %x\n", *part ); + + return VENDOR_ARM; + } + ++char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ) ++{ ++ // This function searches for the first line of the file located at ++ // 'filepath' that contains the string 'target' and then copies that ++ // line (actually, the substring of the line starting with 'target') ++ // to 'buffer', which is 'buf_len' bytes long. ++ ++ char* r_val = NULL; ++ ++ // Allocate a temporary local buffer equal to the size of buffer. ++ char* buf_local = malloc( buf_len * sizeof( char ) ); ++ ++ // Open the file stream. ++ FILE* stream = fopen( filepath, "r" ); ++ ++ // Repeatedly read in a line from the stream, storing the contents of ++ // the stream into buf_local. ++ while ( !feof( stream ) ) ++ { ++ // Read in the current line, up to buf_len-1 bytes. ++ r_val = fgets( buf_local, buf_len-1, stream ); ++ ++ //printf( "read line: %s", buf_local ); ++ ++ // fgets() returns the pointer specified by the first argument (in ++ // this case, buf_local) on success and NULL on error. ++ if ( r_val == NULL ) break; ++ ++ // Since fgets() was successful, we can search for the target string ++ // within the current line, as captured in buf_local. ++ r_val = strstr( buf_local, target ); ++ ++ // If the target string was found in buf_local, we save it to buffer. ++ if ( r_val != NULL ) ++ { ++ //printf( " found match to '%s'\n", target ); ++ ++ // Copy the string read by fgets() to the caller's buffer. ++ strncpy( buffer, buf_local, buf_len ); ++ ++ // Make sure that we have a terminating null character by the ++ // end of the buffer. ++ if ( buf_len > 0 ) buffer[ buf_len - 1 ] = '\0'; ++ ++ // Leave the loop since we found the target string. ++ break; ++ } ++ } ++ ++ // Close the file stream. ++ fclose( stream ); ++ ++ // Free the temporary local buffer. ++ free( buf_local ); ++ ++ // Return r_val so the caller knows if we failed. ++ return r_val; ++} ++ + #endif +diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h +index e609dcb..b6ecd3d 100644 +--- a/frame/base/bli_cpuid.h ++++ b/frame/base/bli_cpuid.h +@@ -50,28 +50,28 @@ + #ifndef BLIS_CPUID_H + #define BLIS_CPUID_H + +-arch_t bli_cpuid_query_id( void ); ++arch_t bli_cpuid_query_id( void ); + + // Intel +-bool_t bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); +-bool_t bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); +-bool_t bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); +-bool_t bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); +-bool_t bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); ++bool_t bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); ++bool_t bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); ++bool_t bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); ++bool_t bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); ++bool_t bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); + + // AMD +-bool_t bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); +-bool_t bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); +-bool_t bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); +-bool_t bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); +-bool_t bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); ++bool_t bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); ++bool_t bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); ++bool_t bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); ++bool_t bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); ++bool_t bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); + + // ARM +-bool_t bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); +-bool_t bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features ); +-bool_t bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); +-bool_t bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); +-bool_t bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); ++bool_t bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); ++bool_t bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features ); ++bool_t bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); ++bool_t bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); ++bool_t bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); + + uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features ); + +@@ -156,6 +156,8 @@ enum + + #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) + ++char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); ++ + enum + { + VENDOR_ARM = 0, +-- +1.8.3.1 + diff --git a/blis-Use-funsafe-math-optimizations-and-ffp-contract-fast.patch b/blis-Use-funsafe-math-optimizations-and-ffp-contract-fast.patch new file mode 100644 index 0000000..73a1e90 --- /dev/null +++ b/blis-Use-funsafe-math-optimizations-and-ffp-contract-fast.patch @@ -0,0 +1,537 @@ +From 138d403b6bb15e687a3fe26d3d967b8ccd1ed97b Mon Sep 17 00:00:00 2001 +From: Devin Matthews +Date: Mon, 26 Aug 2019 18:11:27 -0500 +Subject: [PATCH] Use -funsafe-math-optimizations and -ffp-contract=fast for + all reference kernels when using gcc or clang. (#331) + +--- + config/amd64/make_defs.mk | 6 +++++- + config/arm32/make_defs.mk | 8 ++++++++ + config/arm64/make_defs.mk | 8 ++++++++ + config/bgq/make_defs.mk | 8 ++++++++ + config/bulldozer/make_defs.mk | 6 +++++- + config/cortexa15/make_defs.mk | 8 ++++++++ + config/cortexa53/make_defs.mk | 8 ++++++++ + config/cortexa57/make_defs.mk | 8 ++++++++ + config/cortexa9/make_defs.mk | 8 ++++++++ + config/excavator/make_defs.mk | 6 +++++- + config/generic/make_defs.mk | 6 +++++- + config/haswell/make_defs.mk | 6 +++++- + config/intel64/make_defs.mk | 6 +++++- + config/knc/make_defs.mk | 6 +++++- + config/knl/make_defs.mk | 4 ++-- + config/penryn/make_defs.mk | 6 +++++- + config/piledriver/make_defs.mk | 6 +++++- + config/power7/make_defs.mk | 8 ++++++++ + config/power9/make_defs.mk | 8 ++++++++ + config/sandybridge/make_defs.mk | 6 +++++- + config/skx/make_defs.mk | 4 ++-- + config/steamroller/make_defs.mk | 6 +++++- + config/thunderx2/make_defs.mk | 8 ++++++++ + config/x86_64/make_defs.mk | 6 +++++- + config/zen/make_defs.mk | 6 +++++- + 25 files changed, 149 insertions(+), 17 deletions(-) + +diff --git a/config/amd64/make_defs.mk b/config/amd64/make_defs.mk +index 70c0b69..df7cd20 100644 +--- a/config/amd64/make_defs.mk ++++ b/config/amd64/make_defs.mk +@@ -75,10 +75,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/arm32/make_defs.mk b/config/arm32/make_defs.mk +index b592851..0b517a1 100644 +--- a/config/arm32/make_defs.mk ++++ b/config/arm32/make_defs.mk +@@ -70,7 +70,15 @@ endif + + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) ++ifeq ($(CC_VENDOR),gcc) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else + CRVECFLAGS := $(CKVECFLAGS) ++endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/arm64/make_defs.mk b/config/arm64/make_defs.mk +index ac1cd69..5ffb081 100644 +--- a/config/arm64/make_defs.mk ++++ b/config/arm64/make_defs.mk +@@ -70,7 +70,15 @@ endif + + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) ++ifeq ($(CC_VENDOR),gcc) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else + CRVECFLAGS := $(CKVECFLAGS) ++endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk +index a577a9a..97ea5a5 100644 +--- a/config/bgq/make_defs.mk ++++ b/config/bgq/make_defs.mk +@@ -79,7 +79,15 @@ endif + + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) ++ifeq ($(CC_VENDOR),gcc) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else + CRVECFLAGS := $(CKVECFLAGS) ++endif ++endif + + # Override the default value for LDFLAGS. + ifeq ($(CC_VENDOR),ibm) +diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk +index dec89a4..8f71da3 100644 +--- a/config/bulldozer/make_defs.mk ++++ b/config/bulldozer/make_defs.mk +@@ -75,10 +75,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/cortexa15/make_defs.mk b/config/cortexa15/make_defs.mk +index ee4d301..0cbf304 100644 +--- a/config/cortexa15/make_defs.mk ++++ b/config/cortexa15/make_defs.mk +@@ -70,7 +70,15 @@ endif + + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) ++ifeq ($(CC_VENDOR),gcc) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else + CRVECFLAGS := $(CKVECFLAGS) ++endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/cortexa53/make_defs.mk b/config/cortexa53/make_defs.mk +index 9f723bc..3e116cd 100644 +--- a/config/cortexa53/make_defs.mk ++++ b/config/cortexa53/make_defs.mk +@@ -70,7 +70,15 @@ endif + + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) ++ifeq ($(CC_VENDOR),gcc) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else + CRVECFLAGS := $(CKVECFLAGS) ++endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/cortexa57/make_defs.mk b/config/cortexa57/make_defs.mk +index 23bcf51..864872b 100644 +--- a/config/cortexa57/make_defs.mk ++++ b/config/cortexa57/make_defs.mk +@@ -70,7 +70,15 @@ endif + + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) ++ifeq ($(CC_VENDOR),gcc) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else + CRVECFLAGS := $(CKVECFLAGS) ++endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/cortexa9/make_defs.mk b/config/cortexa9/make_defs.mk +index 2adc40e..310b75b 100644 +--- a/config/cortexa9/make_defs.mk ++++ b/config/cortexa9/make_defs.mk +@@ -70,7 +70,15 @@ endif + + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) ++ifeq ($(CC_VENDOR),gcc) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else + CRVECFLAGS := $(CKVECFLAGS) ++endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/excavator/make_defs.mk b/config/excavator/make_defs.mk +index deb85c7..ed73d5d 100644 +--- a/config/excavator/make_defs.mk ++++ b/config/excavator/make_defs.mk +@@ -75,10 +75,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/generic/make_defs.mk b/config/generic/make_defs.mk +index 3388291..7f934de 100644 +--- a/config/generic/make_defs.mk ++++ b/config/generic/make_defs.mk +@@ -79,10 +79,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk +index 3f49f19..6752dde 100644 +--- a/config/haswell/make_defs.mk ++++ b/config/haswell/make_defs.mk +@@ -83,10 +83,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) #-funsafe-math-optimizations ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/intel64/make_defs.mk b/config/intel64/make_defs.mk +index af462fd..f74fb4d 100644 +--- a/config/intel64/make_defs.mk ++++ b/config/intel64/make_defs.mk +@@ -79,10 +79,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/knc/make_defs.mk b/config/knc/make_defs.mk +index be3c901..d585219 100644 +--- a/config/knc/make_defs.mk ++++ b/config/knc/make_defs.mk +@@ -71,10 +71,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Override the default value for LDFLAGS. + LDFLAGS := -mmic +diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk +index b08cf1e..aa74df3 100644 +--- a/config/knl/make_defs.mk ++++ b/config/knl/make_defs.mk +@@ -99,13 +99,13 @@ endif + # Note: We use AVX2 for reference kernels instead of AVX-512. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations ++CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast + else + ifeq ($(CC_VENDOR),icc) + CRVECFLAGS := -xMIC-AVX512 + else + ifeq ($(CC_VENDOR),clang) +-CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd ++CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast + else + $(error gcc, icc, or clang is required for this configuration.) + endif +diff --git a/config/penryn/make_defs.mk b/config/penryn/make_defs.mk +index 41d2d93..573382e 100644 +--- a/config/penryn/make_defs.mk ++++ b/config/penryn/make_defs.mk +@@ -79,10 +79,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk +index bb23fbe..8cf3ac5 100644 +--- a/config/piledriver/make_defs.mk ++++ b/config/piledriver/make_defs.mk +@@ -75,10 +75,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk +index 18f111b..9633b4f 100644 +--- a/config/power7/make_defs.mk ++++ b/config/power7/make_defs.mk +@@ -70,7 +70,15 @@ endif + + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) ++ifeq ($(CC_VENDOR),gcc) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else + CRVECFLAGS := $(CKVECFLAGS) ++endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/power9/make_defs.mk b/config/power9/make_defs.mk +index 3d66f60..b2c78b1 100644 +--- a/config/power9/make_defs.mk ++++ b/config/power9/make_defs.mk +@@ -70,7 +70,15 @@ endif + + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) ++ifeq ($(CC_VENDOR),gcc) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else + CRVECFLAGS := $(CKVECFLAGS) ++endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk +index 285c4d5..896cb89 100644 +--- a/config/sandybridge/make_defs.mk ++++ b/config/sandybridge/make_defs.mk +@@ -83,10 +83,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/skx/make_defs.mk b/config/skx/make_defs.mk +index 27bea5e..920b42d 100644 +--- a/config/skx/make_defs.mk ++++ b/config/skx/make_defs.mk +@@ -89,13 +89,13 @@ endif + # to overcome the AVX-512 frequency drop". (Issue #187) + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations ++CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast + else + ifeq ($(CC_VENDOR),icc) + CRVECFLAGS := -xCORE-AVX2 + else + ifeq ($(CC_VENDOR),clang) +-CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd ++CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast + else + $(error gcc, icc, or clang is required for this configuration.) + endif +diff --git a/config/steamroller/make_defs.mk b/config/steamroller/make_defs.mk +index a5b6707..89c7689 100644 +--- a/config/steamroller/make_defs.mk ++++ b/config/steamroller/make_defs.mk +@@ -75,10 +75,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/thunderx2/make_defs.mk b/config/thunderx2/make_defs.mk +index 3227fe2..820919d 100644 +--- a/config/thunderx2/make_defs.mk ++++ b/config/thunderx2/make_defs.mk +@@ -70,7 +70,15 @@ endif + + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) ++ifeq ($(CC_VENDOR),gcc) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else + CRVECFLAGS := $(CKVECFLAGS) ++endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/x86_64/make_defs.mk b/config/x86_64/make_defs.mk +index 4d038ff..520cd42 100644 +--- a/config/x86_64/make_defs.mk ++++ b/config/x86_64/make_defs.mk +@@ -79,10 +79,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk +index f0752e9..1b9db53 100644 +--- a/config/zen/make_defs.mk ++++ b/config/zen/make_defs.mk +@@ -83,10 +83,14 @@ endif + # Flags specific to reference kernels. + CROPTFLAGS := $(CKOPTFLAGS) + ifeq ($(CC_VENDOR),gcc) +-CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ++else ++ifeq ($(CC_VENDOR),clang) ++CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast + else + CRVECFLAGS := $(CKVECFLAGS) + endif ++endif + + # Store all of the variables here to new variables containing the + # configuration name. +-- +1.8.3.1 +