diff --git a/.gitignore b/.gitignore
index 779591a..49a47af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 /ntl-9.6.4.tar.gz
 /ntl-9.7.0.tar.gz
 /ntl-9.8.0.tar.gz
+/ntl-9.9.1.tar.gz
diff --git a/ntl-loadtime-cpu.patch b/ntl-loadtime-cpu.patch
index 117711f..3ee9724 100644
--- a/ntl-loadtime-cpu.patch
+++ b/ntl-loadtime-cpu.patch
@@ -1,6 +1,6 @@
---- doc/config.txt.orig	2016-04-26 05:40:15.000000000 -0600
-+++ doc/config.txt	2016-04-26 18:22:38.925719916 -0600
-@@ -291,6 +291,7 @@ NTL_GF2X_NOINLINE=off
+--- doc/config.txt.orig	2016-06-02 04:59:09.000000000 -0600
++++ doc/config.txt	2016-06-02 09:48:29.369273869 -0600
+@@ -292,6 +292,7 @@ NTL_GF2X_NOINLINE=off
  NTL_GF2X_ALTCODE=off
  NTL_GF2X_ALTCODE1=off
  NTL_PCLMUL=off
@@ -8,7 +8,7 @@
  
  GMP_INCDIR=$(GMP_PREFIX)/include
  GMP_LIBDIR=$(GMP_PREFIX)/lib
-@@ -638,6 +639,10 @@ NTL_PCLMUL=off
+@@ -644,6 +645,10 @@ NTL_PCLMUL=off
  # switch to enable the PCLMUL instruction on x86 machines for faster arithmetic
  # over GF(2)[X] (without relying on the gf2x package)
  
@@ -19,9 +19,9 @@
  
  
  ########## More GMP Options:
---- include/NTL/config.h.orig	2016-04-26 05:40:16.000000000 -0600
-+++ include/NTL/config.h	2016-04-26 18:22:38.929719590 -0600
-@@ -616,6 +616,23 @@ using the configure script.
+--- include/NTL/config.h.orig	2016-06-02 04:59:09.000000000 -0600
++++ include/NTL/config.h	2016-06-02 09:48:29.374273466 -0600
+@@ -625,6 +625,23 @@ using the configure script.
  #endif
  
  
@@ -45,8 +45,8 @@
  
  
  
---- include/NTL/ctools.h.orig	2016-04-26 05:40:15.000000000 -0600
-+++ include/NTL/ctools.h	2016-04-26 18:22:38.930719509 -0600
+--- include/NTL/ctools.h.orig	2016-06-02 04:59:09.000000000 -0600
++++ include/NTL/ctools.h	2016-06-02 09:48:29.376273304 -0600
 @@ -422,6 +422,137 @@ void _ntl_swap(T*& a, T*& b)
  // this should be big enough to satisfy any SIMD instructions,
  // and it should also be as big as a cache line
@@ -185,9 +185,9 @@
  
     
  
---- include/NTL/def_config.h.orig	2016-04-26 05:40:15.000000000 -0600
-+++ include/NTL/def_config.h	2016-04-26 18:22:38.930719509 -0600
-@@ -616,6 +616,22 @@ using the configure script.
+--- include/NTL/def_config.h.orig	2016-06-02 04:59:09.000000000 -0600
++++ include/NTL/def_config.h	2016-06-02 09:48:29.377273224 -0600
+@@ -625,6 +625,22 @@ using the configure script.
  #endif
  
  
@@ -210,9 +210,9 @@
  
  
  
---- src/cfile.orig	2016-04-26 05:40:15.000000000 -0600
-+++ src/cfile	2016-04-26 18:22:38.931719428 -0600
-@@ -616,6 +616,23 @@ using the configure script.
+--- src/cfile.orig	2016-06-02 04:59:09.000000000 -0600
++++ src/cfile	2016-06-02 09:48:29.377273224 -0600
+@@ -625,6 +625,23 @@ using the configure script.
  #endif
  
  
@@ -236,20 +236,21 @@
  
  @{WIZARD_HACK}
  
---- src/DispSettings.c.orig	2016-04-26 05:40:15.000000000 -0600
-+++ src/DispSettings.c	2016-04-26 18:22:38.931719428 -0600
-@@ -186,6 +186,9 @@ cout << "Performance Options:\n";
+--- src/DispSettings.c.orig	2016-06-02 04:59:09.000000000 -0600
++++ src/DispSettings.c	2016-06-02 09:50:13.512867963 -0600
+@@ -191,6 +191,10 @@ cout << "Performance Options:\n";
     cout << "NTL_PCLMUL\n";
  #endif
  
 +#ifdef NTL_LOADTIME_CPU
 +   cout << "NTL_LOADTIME_CPU\n";
 +#endif
++
  
+    cout << "***************************/\n";
     cout << "\n\n";
- 
---- src/DoConfig.orig	2016-04-26 05:40:15.000000000 -0600
-+++ src/DoConfig	2016-04-26 18:24:47.237292382 -0600
+--- src/DoConfig.orig	2016-06-02 04:59:09.000000000 -0600
++++ src/DoConfig	2016-06-02 09:48:29.379273062 -0600
 @@ -1,7 +1,7 @@
  # This is a perl script, invoked from a shell
  
@@ -259,7 +260,7 @@
  
  
  %MakeFlag = (
-@@ -82,6 +82,7 @@
+@@ -83,6 +83,7 @@
  'NTL_RANGE_CHECK'         => 'off',
  'NTL_FFT_BIGTAB'          => 'off',
  'NTL_FFT_LAZYMUL'         => 'off',
@@ -267,7 +268,7 @@
  
  );
  
-@@ -148,6 +149,15 @@ if ($ConfigFlag{'NTL_THREADS'} eq 'on' &
+@@ -149,6 +150,15 @@ if ($ConfigFlag{'NTL_THREADS'} eq 'on' &
  }
  
  
@@ -283,8 +284,8 @@
  
  # some special MakeVal values that are determined by SHARED
  
---- src/GF2X1.c.orig	2016-04-26 05:40:15.000000000 -0600
-+++ src/GF2X1.c	2016-04-26 18:22:38.933719265 -0600
+--- src/GF2X1.c.orig	2016-06-02 04:59:08.000000000 -0600
++++ src/GF2X1.c	2016-06-02 09:48:29.381272901 -0600
 @@ -19,7 +19,7 @@
  // simple scaling factor for some crossover points:
  // we use a lower crossover of the underlying multiplication
@@ -294,8 +295,8 @@
  #define XOVER_SCALE (1L)
  #else
  #define XOVER_SCALE (2L)
---- src/GF2X.c.orig	2016-04-26 05:40:15.000000000 -0600
-+++ src/GF2X.c	2016-04-26 18:22:38.933719265 -0600
+--- src/GF2X.c.orig	2016-06-02 04:59:08.000000000 -0600
++++ src/GF2X.c	2016-06-02 09:48:29.382272820 -0600
 @@ -31,6 +31,22 @@ pclmul_mul1 (unsigned long *c, unsigned
     __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0));
     _mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0));
@@ -639,9 +640,9 @@
  
  
  void LeftShift(GF2X& c, const GF2X& a, long n)
---- src/InitSettings.c.orig	2016-04-26 05:40:15.000000000 -0600
-+++ src/InitSettings.c	2016-04-26 20:06:27.078557786 -0600
-@@ -150,6 +150,11 @@ int main()
+--- src/InitSettings.c.orig	2016-06-02 04:59:09.000000000 -0600
++++ src/InitSettings.c	2016-06-02 09:48:29.382272820 -0600
+@@ -156,6 +156,11 @@ int main()
     cout << "NTL_RANGE_CHECK=0\n";
  #endif
  
@@ -653,8 +654,8 @@
  
  // the following is synthetically defined
  #ifdef NTL_LONGLONG_SP_MULMOD
---- src/lzz_pX1.c.orig	2016-04-26 05:40:15.000000000 -0600
-+++ src/lzz_pX1.c	2016-04-26 18:22:38.934719184 -0600
+--- src/lzz_pX1.c.orig	2016-06-02 04:59:08.000000000 -0600
++++ src/lzz_pX1.c	2016-06-02 09:48:29.383272740 -0600
 @@ -4,6 +4,12 @@
  
  #ifdef NTL_HAVE_AVX
@@ -1083,8 +1084,8 @@
  
     default:
        LogicError("CompMod: bad strategy");
---- src/mat_lzz_p.c.orig	2016-04-26 05:40:15.000000000 -0600
-+++ src/mat_lzz_p.c	2016-04-26 18:22:38.937718940 -0600
+--- src/mat_lzz_p.c.orig	2016-06-02 04:59:08.000000000 -0600
++++ src/mat_lzz_p.c	2016-06-02 09:48:29.385272578 -0600
 @@ -10,6 +10,15 @@
  
  #ifdef NTL_HAVE_AVX
@@ -1101,7 +1102,7 @@
  #endif
  
  NTL_START_IMPL
-@@ -632,7 +641,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
+@@ -626,7 +635,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
  
  #ifdef NTL_HAVE_LL_TYPE
  
@@ -1110,7 +1111,7 @@
  
  #define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1)
  // max int representable exactly as a double
-@@ -646,19 +655,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
+@@ -640,19 +649,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
  
  
  // MUL_ADD(a, b, c): a += b*c
@@ -1237,7 +1238,7 @@
     __m256d acc0=_mm256_load_pd(x + 0*4);
     __m256d acc1=_mm256_load_pd(x + 1*4);
     __m256d acc2=_mm256_load_pd(x + 2*4);
-@@ -668,19 +778,82 @@ void muladd1_by_32(double *x, const doub
+@@ -662,19 +772,82 @@ void muladd1_by_32(double *x, const doub
     __m256d acc6=_mm256_load_pd(x + 6*4);
     __m256d acc7=_mm256_load_pd(x + 7*4);
  
@@ -1330,7 +1331,7 @@
     }
  
  
-@@ -694,6 +867,9 @@ void muladd1_by_32(double *x, const doub
+@@ -688,6 +861,9 @@ void muladd1_by_32(double *x, const doub
     _mm256_store_pd(x + 7*4, acc7);
  }
  
@@ -1340,7 +1341,7 @@
  #else
  
  static
-@@ -800,7 +976,164 @@ void muladd1_by_32(double *x, const doub
+@@ -794,7 +970,164 @@ void muladd1_by_32(double *x, const doub
  #endif
  
  // experiment: process two rows at a time
@@ -1506,7 +1507,7 @@
  static
  void muladd2_by_32(double *x, const double *a, const double *b, long n)
  {
-@@ -876,96 +1209,217 @@ void muladd2_by_32(double *x, const doub
+@@ -870,96 +1203,217 @@ void muladd2_by_32(double *x, const doub
     _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
  
  }
@@ -1788,7 +1789,7 @@
  
  static
  void muladd3_by_32(double *x, const double *a, const double *b, long n)
-@@ -1066,6 +1520,32 @@ void muladd3_by_32(double *x, const doub
+@@ -1060,6 +1514,32 @@ void muladd3_by_32(double *x, const doub
  
  }
  
@@ -1821,7 +1822,7 @@
  static inline
  void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
  {
-@@ -1085,8 +1565,79 @@ void muladd_all_by_32(long first, long l
+@@ -1079,8 +1559,79 @@ void muladd_all_by_32(long first, long l
  #endif
  }
  
@@ -1901,7 +1902,7 @@
  static inline
  void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
  {
-@@ -1117,9 +1668,109 @@ void muladd_interval(double * NTL_RESTRI
+@@ -1111,9 +1662,109 @@ void muladd_interval(double * NTL_RESTRI
        _mm256_store_pd(x + 3*4, xvec3);
     }
  }
@@ -2011,7 +2012,7 @@
  static inline
  void muladd_interval1(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
  {
-@@ -1165,11 +1816,74 @@ void muladd_interval1(double * NTL_RESTR
+@@ -1159,11 +1810,74 @@ void muladd_interval1(double * NTL_RESTR
        *x += (*y)*c;
     }
  }
@@ -2087,7 +2088,7 @@
  static inline
  void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
  {
-@@ -1197,13 +1911,6 @@ void muladd_interval2(double * NTL_RESTR
+@@ -1191,13 +1905,6 @@ void muladd_interval2(double * NTL_RESTR
     }
     
  }
@@ -2101,7 +2102,7 @@
  #endif
  
  #endif
-@@ -2245,10 +2952,10 @@ void alt_mul_LL(const mat_window_zz_p& X
+@@ -2031,10 +2738,10 @@ void alt_mul_LL(const mat_window_zz_p& X
  }  
  
  
@@ -2115,7 +2116,7 @@
                  const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)  
  {  
     long n = A.NumRows();  
-@@ -2615,8 +3322,9 @@ void mul_base (const mat_window_zz_p& X,
+@@ -2401,8 +3108,9 @@ void mul_base (const mat_window_zz_p& X,
  
        long V = MAT_BLK_SZ*4;
  
@@ -2127,7 +2128,7 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
-@@ -2696,7 +3404,8 @@ void mul_strassen(const mat_window_zz_p&
+@@ -2482,7 +3190,8 @@ void mul_strassen(const mat_window_zz_p&
      // this code determines if mul_base triggers blk_mul_DD,
      // in which case a higher crossover is used
  
@@ -2137,7 +2138,7 @@
      {
         long V = MAT_BLK_SZ*4;
         long p = zz_p::modulus();
-@@ -3196,10 +3905,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con
+@@ -2982,10 +3691,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con
  
  
  
@@ -2151,7 +2152,7 @@
  {
     long n = A.NumRows();
  
-@@ -3365,10 +4074,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co
+@@ -3151,10 +3860,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co
  
  
  
@@ -2165,7 +2166,7 @@
  {
     long n = A.NumRows();
  
-@@ -4126,8 +4835,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
+@@ -3912,8 +4621,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
     else if (n/MAT_BLK_SZ < 4) {
        long V = 64;
  
@@ -2177,7 +2178,7 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
-@@ -4152,8 +4862,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
+@@ -3938,8 +4648,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
     else {
        long V = 4*MAT_BLK_SZ;
  
@@ -2189,7 +2190,7 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
-@@ -4559,10 +5270,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p&
+@@ -4345,10 +5056,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p&
  
  
  
@@ -2203,7 +2204,7 @@
                 vec_zz_p *xp, bool trans, bool relax)
  {
     long n = A.NumRows();
-@@ -4749,10 +5460,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p&
+@@ -4535,10 +5246,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p&
  
  
  
@@ -2217,7 +2218,7 @@
                 vec_zz_p *xp, bool trans, bool relax)
  {
     long n = A.NumRows();
-@@ -5563,8 +6274,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
+@@ -5349,8 +6060,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
     else if (n/MAT_BLK_SZ < 4) {
        long V = 64;
  
@@ -2229,7 +2230,7 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
-@@ -5589,8 +6301,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
+@@ -5375,8 +6087,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
     else {
        long V = 4*MAT_BLK_SZ;
  
@@ -2241,7 +2242,7 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
-@@ -5836,7 +6549,7 @@ long elim_basic(const mat_zz_p& A, mat_z
+@@ -5622,7 +6335,7 @@ long elim_basic(const mat_zz_p& A, mat_z
  #ifdef NTL_HAVE_LL_TYPE
  
  
@@ -2250,7 +2251,7 @@
  
  
  static inline
-@@ -7289,8 +8002,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i
+@@ -7075,8 +7788,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i
     else {
        long V = 4*MAT_BLK_SZ;
  
@@ -2262,9 +2263,9 @@
            V <= (MAX_DBL_INT-(p-1))/(p-1) &&
            V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
  
---- src/QuickTest.c.orig	2016-04-26 05:40:15.000000000 -0600
-+++ src/QuickTest.c	2016-04-26 18:22:38.938718859 -0600
-@@ -335,6 +335,9 @@ cerr << "Performance Options:\n";
+--- src/QuickTest.c.orig	2016-06-02 04:59:09.000000000 -0600
++++ src/QuickTest.c	2016-06-02 09:48:29.386272498 -0600
+@@ -339,6 +339,9 @@ cerr << "Performance Options:\n";
     cerr << "NTL_PCLMUL\n";
  #endif
  
@@ -2274,8 +2275,8 @@
  
     cerr << "\n\n";
  
---- src/WizardAux.orig	2016-04-26 05:40:15.000000000 -0600
-+++ src/WizardAux	2016-04-26 18:22:38.938718859 -0600
+--- src/WizardAux.orig	2016-06-02 04:59:09.000000000 -0600
++++ src/WizardAux	2016-06-02 09:48:29.386272498 -0600
 @@ -94,6 +94,7 @@ system("make InitSettings");
  'NTL_PCLMUL'           => 0,
  'NTL_FFT_BIGTAB'       => 0,
diff --git a/ntl.spec b/ntl.spec
index 89f1902..fa67172 100644
--- a/ntl.spec
+++ b/ntl.spec
@@ -10,7 +10,7 @@
 
 Summary: High-performance algorithms for vectors, matrices, and polynomials 
 Name:    ntl 
-Version: 9.8.0
+Version: 9.9.1
 Release: 1%{?dist}
 
 License: GPLv2+
@@ -136,7 +136,7 @@ done
 %files
 %doc README
 %license doc/copying.txt
-%{_libdir}/libntl.so.24*
+%{_libdir}/libntl.so.27*
 
 %files devel 
 %doc doc/*
@@ -150,6 +150,9 @@ done
 
 
 %changelog
+* Thu Jun  2 2016 Jerry James <loganjerry@gmail.com> - 9.9.1-1
+- ntl-9.9.1
+
 * Fri Apr 29 2016 Jerry James <loganjerry@gmail.com> - 9.8.0-1
 - ntl-9.8.0
 - Add -loadtime-cpu patch
diff --git a/sources b/sources
index d04a3a8..df9dc70 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-a7e87d859511c15023169fa0fcf9903b  ntl-9.8.0.tar.gz
+2b189eb103e588d498ead4c6a8b09b3c  ntl-9.9.1.tar.gz