diff --git a/openssl-1.0.0-beta4-aesni.patch b/openssl-1.0.0-beta4-aesni.patch
new file mode 100644
index 0000000..f57918b
--- /dev/null
+++ b/openssl-1.0.0-beta4-aesni.patch
@@ -0,0 +1,2388 @@
+diff -up openssl-1.0.0-beta4/Configure.aesni openssl-1.0.0-beta4/Configure
+--- openssl-1.0.0-beta4/Configure.aesni 2010-01-07 23:38:31.000000000 +0100
++++ openssl-1.0.0-beta4/Configure 2010-01-12 22:18:06.000000000 +0100
+@@ -123,11 +123,11 @@ my $tlib="-lnsl -lsocket";
+ my $bits1="THIRTY_TWO_BIT ";
+ my $bits2="SIXTY_FOUR_BIT ";
+ 
+-my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o";
++my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o";
+ 
+ my $x86_elf_asm="$x86_asm:elf";
+ 
+-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o";
++my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o";
+ my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::void";
+ my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::void";
+ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
+@@ -491,7 +491,7 @@ my %table=(
+ #
+ # Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64
+ "VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ias:win32",
+-"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:auto:win32",
++"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:auto:win32",
+ # x86 Win32 target defaults to ANSI API, if you want UNICODE, complement
+ # 'perl Configure VC-WIN32' with '-DUNICODE -D_UNICODE'
+ "VC-WIN32","cl:-W3 -WX -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE:::WIN32::BN_LLONG RC4_INDEX EXPORT_VAR_AS_FN ${x86_gcc_opts}:${x86_asm}:win32n:win32",
+@@ -1410,6 +1410,7 @@ if ($rmd160_obj =~ /\.o$/)
+ if ($aes_obj =~ /\.o$/)
+ {
+ $cflags.=" -DAES_ASM";
++ $aes_obj =~ s/\s*aesni\-x86\.o// if ($no_sse2);
+ }
+ else {
+ $aes_obj=$aes_enc;
+diff -up openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86.pl.aesni openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86.pl
+--- openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86.pl.aesni 2010-01-12 22:18:06.000000000 +0100
++++ openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86.pl 2010-01-12 22:18:06.000000000 +0100
+@@ -0,0 +1,765 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by Andy Polyakov for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++#
++# This module implements support for Intel AES-NI extension. In
++# OpenSSL context it's used with Intel engine, but can also be used as
++# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
++# details].
++
++$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
++ # generates drop-in replacement for
++ # crypto/aes/asm/aes-586.pl:-)
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++push(@INC,"${dir}","${dir}../../perlasm");
++require "x86asm.pl";
++
++&asm_init($ARGV[0],$0);
++
++$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups"); # NOTE(review): $RREFIX is not set in this file and "aseni" != "aesni", so this always selects *movups; switching to movaps would require 16-byte aligned key schedules - confirm before "fixing"
++
++$len="eax";
++$rounds="ecx";
++$key="edx";
++$inp="esi";
++$out="edi";
++$rounds_="ebx"; # backup copy for $rounds
++$key_="ebp"; # backup copy for $key
++
++$inout0="xmm0";
++$inout1="xmm1";
++$inout2="xmm2";
++$rndkey0="xmm3";
++$rndkey1="xmm4";
++$ivec="xmm5";
++$in0="xmm6";
++$in1="xmm7"; $inout3="xmm7"; # $in1 and $inout3 share xmm7
++
++# Inline version of internal aesni_[en|de]crypt1
++sub aesni_inline_generate1 # emits a 1-block round loop in place; advances $key, counts $rounds down
++{ my $p=shift; # "enc" or "dec"
++
++ &$movekey ($rndkey0,&QWP(0,$key));
++ &$movekey ($rndkey1,&QWP(16,$key));
++ &lea ($key,&DWP(32,$key));
++ &pxor ($inout0,$rndkey0);
++ &set_label("${p}1_loop");
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &dec ($rounds);
++ &$movekey ($rndkey1,&QWP(0,$key));
++ &lea ($key,&DWP(16,$key));
++ &jnz (&label("${p}1_loop"));
++ eval"&aes${p}last ($inout0,$rndkey1)";
++}
++
++sub aesni_generate1 # fully unrolled loop
++{ my $p=shift; # "enc" or "dec"
++
++ &function_begin_B("_aesni_${p}rypt1");
++ &$movekey ($rndkey0,&QWP(0,$key));
++ &$movekey ($rndkey1,&QWP(0x10,$key));
++ &cmp ($rounds,11);
++ &pxor ($inout0,$rndkey0);
++ &$movekey ($rndkey0,&QWP(0x20,$key));
++ &lea ($key,&DWP(0x30,$key));
++ &jb (&label("${p}128"));
++ &lea ($key,&DWP(0x20,$key));
++ &je (&label("${p}192"));
++ &lea ($key,&DWP(0x20,$key));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(-0x40,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(-0x30,$key));
++ &set_label("${p}192");
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(-0x20,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(-0x10,$key));
++ &set_label("${p}128");
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(0,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(0x10,$key));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(0x20,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(0x30,$key));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(0x40,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(0x50,$key));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(0x60,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(0x70,$key));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ eval"&aes${p}last ($inout0,$rndkey0)";
++ &ret();
++ &function_end_B("_aesni_${p}rypt1");
++}
++
++# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
++# &aesni_generate1("enc");
++&function_begin_B("${PREFIX}_encrypt");
++ &mov ("eax",&wparam(0));
++ &mov ($key,&wparam(2));
++ &movups ($inout0,&QWP(0,"eax"));
++ &mov ($rounds,&DWP(240,$key));
++ &mov ("eax",&wparam(1));
++ &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
++ &movups (&QWP(0,"eax"),$inout0);
++ &ret ();
++&function_end_B("${PREFIX}_encrypt");
++
++# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
++# &aesni_generate1("dec");
++&function_begin_B("${PREFIX}_decrypt");
++ &mov ("eax",&wparam(0));
++ &mov ($key,&wparam(2));
++ &movups ($inout0,&QWP(0,"eax"));
++ &mov ($rounds,&DWP(240,$key));
++ &mov ("eax",&wparam(1));
++ &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
++ &movups (&QWP(0,"eax"),$inout0);
++ &ret ();
++&function_end_B("${PREFIX}_decrypt");
++
++# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
++# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
++# latency is 6, it turned out that it can be scheduled only every
++# *second* cycle. Thus 3x interleave is the one providing optimal
++# utilization, i.e. when subroutine's throughput is virtually same as
++# of non-interleaved subroutine [for number of input blocks up to 3].
++# This is why it makes no sense to implement 2x subroutine. As soon
++# as/if Intel improves throughput by making it possible to schedule
++# the instructions in question *every* cycle I would have to
++# implement 6x interleave and use it in loop...
++sub aesni_generate3
++{ my $p=shift; # "enc" or "dec"; emitted code modifies $key and $rounds
++
++ &function_begin_B("_aesni_${p}rypt3");
++ &$movekey ($rndkey0,&QWP(0,$key));
++ &shr ($rounds,1);
++ &$movekey ($rndkey1,&QWP(16,$key));
++ &lea ($key,&DWP(32,$key));
++ &pxor ($inout0,$rndkey0);
++ &pxor ($inout1,$rndkey0);
++ &pxor ($inout2,$rndkey0);
++ &jmp (&label("${p}3_loop"));
++ &set_label("${p}3_loop",16);
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey0,&QWP(0,$key));
++ eval"&aes${p} ($inout1,$rndkey1)";
++ &dec ($rounds);
++ eval"&aes${p} ($inout2,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(16,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &lea ($key,&DWP(32,$key));
++ eval"&aes${p} ($inout1,$rndkey0)";
++ eval"&aes${p} ($inout2,$rndkey0)";
++ &jnz (&label("${p}3_loop"));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey0,&QWP(0,$key));
++ eval"&aes${p} ($inout1,$rndkey1)";
++ eval"&aes${p} ($inout2,$rndkey1)";
++ eval"&aes${p}last ($inout0,$rndkey0)";
++ eval"&aes${p}last ($inout1,$rndkey0)";
++ eval"&aes${p}last ($inout2,$rndkey0)";
++ &ret();
++ &function_end_B("_aesni_${p}rypt3");
++}
++
++# 4x interleave is implemented to improve small block performance,
++# most notably [and naturally] 4 block by ~30%. One can argue that one
++# should have implemented 5x as well, but improvement would be <20%,
++# so it's not worth it...
++sub aesni_generate4
++{ my $p=shift; # "enc" or "dec"; emitted code modifies $key and $rounds
++
++ &function_begin_B("_aesni_${p}rypt4");
++ &$movekey ($rndkey0,&QWP(0,$key));
++ &$movekey ($rndkey1,&QWP(16,$key));
++ &shr ($rounds,1);
++ &lea ($key,&DWP(32,$key));
++ &pxor ($inout0,$rndkey0);
++ &pxor ($inout1,$rndkey0);
++ &pxor ($inout2,$rndkey0);
++ &pxor ($inout3,$rndkey0);
++ &jmp (&label("${p}4_loop"));
++ &set_label("${p}4_loop",16);
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey0,&QWP(0,$key));
++ eval"&aes${p} ($inout1,$rndkey1)";
++ &dec ($rounds);
++ eval"&aes${p} ($inout2,$rndkey1)";
++ eval"&aes${p} ($inout3,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(16,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &lea ($key,&DWP(32,$key));
++ eval"&aes${p} ($inout1,$rndkey0)";
++ eval"&aes${p} ($inout2,$rndkey0)";
++ eval"&aes${p} ($inout3,$rndkey0)";
++ &jnz (&label("${p}4_loop"));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey0,&QWP(0,$key));
++ eval"&aes${p} ($inout1,$rndkey1)";
++ eval"&aes${p} ($inout2,$rndkey1)";
++ eval"&aes${p} ($inout3,$rndkey1)";
++ eval"&aes${p}last ($inout0,$rndkey0)";
++ eval"&aes${p}last ($inout1,$rndkey0)";
++ eval"&aes${p}last ($inout2,$rndkey0)";
++ eval"&aes${p}last ($inout3,$rndkey0)";
++ &ret();
++ &function_end_B("_aesni_${p}rypt4");
++}
++&aesni_generate3("enc") if ($PREFIX eq "aesni");
++&aesni_generate3("dec");
++&aesni_generate4("enc") if ($PREFIX eq "aesni");
++&aesni_generate4("dec");
++
++if ($PREFIX eq "aesni") {
++# void aesni_ecb_encrypt (const void *in, void *out,
++# size_t length, const AES_KEY *key,
++# int enc);
++&function_begin("aesni_ecb_encrypt");
++ &mov ($inp,&wparam(0));
++ &mov ($out,&wparam(1));
++ &mov ($len,&wparam(2));
++ &mov ($key,&wparam(3));
++ &mov ($rounds,&wparam(4));
++ &cmp ($len,16);
++ &jb (&label("ecb_ret"));
++ &and ($len,-16);
++ &test ($rounds,$rounds); # flags from the enc argument are consumed by the jz below
++ &mov ($rounds,&DWP(240,$key));
++ &mov ($key_,$key); # backup $key
++ &mov ($rounds_,$rounds); # backup $rounds
++ &jz (&label("ecb_decrypt"));
++
++ &sub ($len,0x40);
++ &jbe (&label("ecb_enc_tail"));
++ &jmp (&label("ecb_enc_loop3"));
++
++&set_label("ecb_enc_loop3",16);
++ &movups ($inout0,&QWP(0,$inp));
++ &movups ($inout1,&QWP(0x10,$inp));
++ &movups ($inout2,&QWP(0x20,$inp));
++ &call ("_aesni_encrypt3");
++ &sub ($len,0x30);
++ &lea ($inp,&DWP(0x30,$inp));
++ &lea ($out,&DWP(0x30,$out));
++ &movups (&QWP(-0x30,$out),$inout0);
++ &mov ($key,$key_); # restore $key
++ &movups (&QWP(-0x20,$out),$inout1);
++ &mov ($rounds,$rounds_); # restore $rounds
++ &movups (&QWP(-0x10,$out),$inout2);
++ &ja (&label("ecb_enc_loop3"));
++
++&set_label("ecb_enc_tail");
++ &add ($len,0x40);
++ &jz (&label("ecb_ret"));
++
++ &cmp ($len,0x10);
++ &movups ($inout0,&QWP(0,$inp));
++ &je (&label("ecb_enc_one"));
++ &cmp ($len,0x20);
++ &movups ($inout1,&QWP(0x10,$inp));
++ &je (&label("ecb_enc_two"));
++ &cmp ($len,0x30);
++ &movups ($inout2,&QWP(0x20,$inp));
++ &je (&label("ecb_enc_three"));
++ &movups ($inout3,&QWP(0x30,$inp));
++ &call ("_aesni_encrypt4");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &movups (&QWP(0x30,$out),$inout3);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_enc_one",16);
++ &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
++ &movups (&QWP(0,$out),$inout0);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_enc_two",16);
++ &call ("_aesni_encrypt3");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_enc_three",16);
++ &call ("_aesni_encrypt3");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_decrypt",16);
++ &sub ($len,0x40);
++ &jbe (&label("ecb_dec_tail"));
++ &jmp (&label("ecb_dec_loop3"));
++
++&set_label("ecb_dec_loop3",16);
++ &movups ($inout0,&QWP(0,$inp));
++ &movups ($inout1,&QWP(0x10,$inp));
++ &movups ($inout2,&QWP(0x20,$inp));
++ &call ("_aesni_decrypt3");
++ &sub ($len,0x30);
++ &lea ($inp,&DWP(0x30,$inp));
++ &lea ($out,&DWP(0x30,$out));
++ &movups (&QWP(-0x30,$out),$inout0);
++ &mov ($key,$key_); # restore $key
++ &movups (&QWP(-0x20,$out),$inout1);
++ &mov ($rounds,$rounds_); # restore $rounds
++ &movups (&QWP(-0x10,$out),$inout2);
++ &ja (&label("ecb_dec_loop3"));
++
++&set_label("ecb_dec_tail");
++ &add ($len,0x40);
++ &jz (&label("ecb_ret"));
++
++ &cmp ($len,0x10);
++ &movups ($inout0,&QWP(0,$inp));
++ &je (&label("ecb_dec_one"));
++ &cmp ($len,0x20);
++ &movups ($inout1,&QWP(0x10,$inp));
++ &je (&label("ecb_dec_two"));
++ &cmp ($len,0x30);
++ &movups ($inout2,&QWP(0x20,$inp));
++ &je (&label("ecb_dec_three"));
++ &movups ($inout3,&QWP(0x30,$inp));
++ &call ("_aesni_decrypt4");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &movups (&QWP(0x30,$out),$inout3);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_dec_one",16);
++ &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
++ &movups (&QWP(0,$out),$inout0);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_dec_two",16);
++ &call ("_aesni_decrypt3");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_dec_three",16);
++ &call ("_aesni_decrypt3");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++
++&set_label("ecb_ret");
++&function_end("aesni_ecb_encrypt");
++}
++
++# void $PREFIX_cbc_encrypt (const void *inp, void *out,
++# size_t length, const AES_KEY *key,
++# unsigned char *ivp,const int enc);
++&function_begin("${PREFIX}_cbc_encrypt");
++ &mov ($inp,&wparam(0));
++ &mov ($out,&wparam(1));
++ &mov ($len,&wparam(2));
++ &mov ($key,&wparam(3));
++ &test ($len,$len);
++ &mov ($key_,&wparam(4));
++ &jz (&label("cbc_ret"));
++
++ &cmp (&wparam(5),0);
++ &movups ($ivec,&QWP(0,$key_)); # load IV
++ &mov ($rounds,&DWP(240,$key));
++ &mov ($key_,$key); # backup $key
++ &mov ($rounds_,$rounds); # backup $rounds
++ &je (&label("cbc_decrypt"));
++
++ &movaps ($inout0,$ivec);
++ &cmp ($len,16);
++ &jb (&label("cbc_enc_tail"));
++ &sub ($len,16);
++ &jmp (&label("cbc_enc_loop"));
++
++&set_label("cbc_enc_loop",16);
++ &movups ($ivec,&QWP(0,$inp));
++ &lea ($inp,&DWP(16,$inp));
++ &pxor ($inout0,$ivec);
++ &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
++ &sub ($len,16);
++ &lea ($out,&DWP(16,$out));
++ &mov ($rounds,$rounds_); # restore $rounds
++ &mov ($key,$key_); # restore $key
++ &movups (&QWP(-16,$out),$inout0);
++ &jnc (&label("cbc_enc_loop"));
++ &add ($len,16);
++ &jnz (&label("cbc_enc_tail"));
++ &movaps ($ivec,$inout0);
++ &jmp (&label("cbc_ret"));
++
++&set_label("cbc_enc_tail");
++ &mov ("ecx",$len); # zaps $rounds
++ &data_word(0xA4F3F689); # rep movsb
++ &mov ("ecx",16); # zero tail
++ &sub ("ecx",$len);
++ &xor ("eax","eax"); # zaps $len
++ &data_word(0xAAF3F689); # rep stosb
++ &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
++ &mov ($rounds,$rounds_); # restore $rounds
++ &mov ($inp,$out); # $inp and $out are the same
++ &mov ($key,$key_); # restore $key
++ &jmp (&label("cbc_enc_loop"));
++
++&set_label("cbc_decrypt",16);
++ &sub ($len,0x40);
++ &jbe (&label("cbc_dec_tail"));
++ &jmp (&label("cbc_dec_loop3"));
++
++&set_label("cbc_dec_loop3",16);
++ &movups ($inout0,&QWP(0,$inp));
++ &movups ($inout1,&QWP(0x10,$inp));
++ &movups ($inout2,&QWP(0x20,$inp));
++ &movaps ($in0,$inout0);
++ &movaps ($in1,$inout1);
++ &call ("_aesni_decrypt3");
++ &sub ($len,0x30);
++ &lea ($inp,&DWP(0x30,$inp));
++ &lea ($out,&DWP(0x30,$out));
++ &pxor ($inout0,$ivec);
++ &pxor ($inout1,$in0);
++ &movups ($ivec,&QWP(-0x10,$inp));
++ &pxor ($inout2,$in1);
++ &movups (&QWP(-0x30,$out),$inout0);
++ &mov ($rounds,$rounds_); # restore $rounds
++ &movups (&QWP(-0x20,$out),$inout1);
++ &mov ($key,$key_); # restore $key
++ &movups (&QWP(-0x10,$out),$inout2);
++ &ja (&label("cbc_dec_loop3"));
++
++&set_label("cbc_dec_tail");
++ &add ($len,0x40);
++ &jz (&label("cbc_ret"));
++
++ &movups ($inout0,&QWP(0,$inp));
++ &cmp ($len,0x10);
++ &movaps ($in0,$inout0);
++ &jbe (&label("cbc_dec_one"));
++ &movups ($inout1,&QWP(0x10,$inp));
++ &cmp ($len,0x20);
++ &movaps ($in1,$inout1);
++ &jbe (&label("cbc_dec_two"));
++ &movups ($inout2,&QWP(0x20,$inp));
++ &cmp ($len,0x30);
++ &jbe (&label("cbc_dec_three"));
++ &movups ($inout3,&QWP(0x30,$inp));
++ &call ("_aesni_decrypt4");
++ &movups ($rndkey0,&QWP(0x10,$inp));
++ &movups ($rndkey1,&QWP(0x20,$inp));
++ &pxor ($inout0,$ivec);
++ &pxor ($inout1,$in0);
++ &movups ($ivec,&QWP(0x30,$inp));
++ &movups (&QWP(0,$out),$inout0);
++ &pxor ($inout2,$rndkey0);
++ &pxor ($inout3,$rndkey1);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &movaps ($inout0,$inout3);
++ &lea ($out,&DWP(0x30,$out));
++ &jmp (&label("cbc_dec_tail_collected"));
++
++&set_label("cbc_dec_one");
++ &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
++ &pxor ($inout0,$ivec);
++ &movaps ($ivec,$in0);
++ &jmp (&label("cbc_dec_tail_collected"));
++
++&set_label("cbc_dec_two");
++ &call ("_aesni_decrypt3");
++ &pxor ($inout0,$ivec);
++ &pxor ($inout1,$in0);
++ &movups (&QWP(0,$out),$inout0);
++ &movaps ($inout0,$inout1);
++ &movaps ($ivec,$in1);
++ &lea ($out,&DWP(0x10,$out));
++ &jmp (&label("cbc_dec_tail_collected"));
++
++&set_label("cbc_dec_three");
++ &call ("_aesni_decrypt3");
++ &pxor ($inout0,$ivec);
++ &pxor ($inout1,$in0);
++ &pxor ($inout2,$in1);
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movaps ($inout0,$inout2);
++ &movups ($ivec,&QWP(0x20,$inp));
++ &lea ($out,&DWP(0x20,$out));
++
++&set_label("cbc_dec_tail_collected");
++ &and ($len,15);
++ &jnz (&label("cbc_dec_tail_partial"));
++ &movups (&QWP(0,$out),$inout0);
++ &jmp (&label("cbc_ret"));
++
++&set_label("cbc_dec_tail_partial");
++ &mov ($key_,"esp");
++ &sub ("esp",16);
++ &and ("esp",-16);
++ &movaps (&QWP(0,"esp"),$inout0);
++ &mov ($inp,"esp");
++ &mov ("ecx",$len);
++ &data_word(0xA4F3F689); # rep movsb
++ &mov ("esp",$key_);
++
++&set_label("cbc_ret");
++ &mov ($key_,&wparam(4));
++ &movups (&QWP(0,$key_),$ivec); # output IV
++&function_end("${PREFIX}_cbc_encrypt");
++
++# Mechanical port from aesni-x86_64.pl.
++#
++# _aesni_set_encrypt_key is private interface,
++# input:
++# "eax" const unsigned char *userKey
++# $rounds int bits
++# $key AES_KEY *key
++# output:
++# "eax" return code
++# $rounds rounds-1 (9, 11 or 13)
++
++&function_begin_B("_aesni_set_encrypt_key");
++ &test ("eax","eax");
++ &jz (&label("bad_pointer"));
++ &test ($key,$key);
++ &jz (&label("bad_pointer"));
++
++ &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
++ &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
++ &lea ($key,&DWP(16,$key));
++ &cmp ($rounds,256);
++ &je (&label("14rounds"));
++ &cmp ($rounds,192);
++ &je (&label("12rounds"));
++ &cmp ($rounds,128);
++ &jne (&label("bad_keybits"));
++
++&set_label("10rounds",16);
++ &mov ($rounds,9);
++ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
++ &aeskeygenassist("xmm1","xmm0",0x01); # round 1
++ &call (&label("key_128_cold"));
++ &aeskeygenassist("xmm1","xmm0",0x2); # round 2
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x04); # round 3
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x08); # round 4
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x10); # round 5
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x20); # round 6
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x40); # round 7
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x80); # round 8
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x36); # round 10
++ &call (&label("key_128"));
++ &$movekey (&QWP(0,$key),"xmm0");
++ &mov (&DWP(80,$key),$rounds);
++ &xor ("eax","eax");
++ &ret();
++
++&set_label("key_128",16);
++ &$movekey (&QWP(0,$key),"xmm0");
++ &lea ($key,&DWP(16,$key));
++&set_label("key_128_cold");
++ &shufps ("xmm4","xmm0",0b00010000);
++ &pxor ("xmm0","xmm4");
++ &shufps ("xmm4","xmm0",0b10001100);
++ &pxor ("xmm0","xmm4");
++ &pshufd ("xmm1","xmm1",0b11111111); # critical path
++ &pxor ("xmm0","xmm1");
++ &ret();
++
++&set_label("12rounds",16);
++ &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
++ &mov ($rounds,11);
++ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
++ &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
++ &call (&label("key_192a_cold"));
++ &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
++ &call (&label("key_192b"));
++ &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
++ &call (&label("key_192a"));
++ &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
++ &call (&label("key_192b"));
++ &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
++ &call (&label("key_192a"));
++ &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
++ &call (&label("key_192b"));
++ &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
++ &call (&label("key_192a"));
++ &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
++ &call (&label("key_192b"));
++ &$movekey (&QWP(0,$key),"xmm0");
++ &mov (&DWP(48,$key),$rounds);
++ &xor ("eax","eax");
++ &ret();
++
++&set_label("key_192a",16);
++ &$movekey (&QWP(0,$key),"xmm0");
++ &lea ($key,&DWP(16,$key));
++&set_label("key_192a_cold",16);
++ &movaps ("xmm5","xmm2");
++&set_label("key_192b_warm");
++ &shufps ("xmm4","xmm0",0b00010000);
++ &movaps ("xmm3","xmm2");
++ &pxor ("xmm0","xmm4");
++ &shufps ("xmm4","xmm0",0b10001100);
++ &pslldq ("xmm3",4);
++ &pxor ("xmm0","xmm4");
++ &pshufd ("xmm1","xmm1",0b01010101); # critical path
++ &pxor ("xmm2","xmm3");
++ &pxor ("xmm0","xmm1");
++ &pshufd ("xmm3","xmm0",0b11111111);
++ &pxor ("xmm2","xmm3");
++ &ret();
++
++&set_label("key_192b",16);
++ &movaps ("xmm3","xmm0");
++ &shufps ("xmm5","xmm0",0b01000100);
++ &$movekey (&QWP(0,$key),"xmm5");
++ &shufps ("xmm3","xmm2",0b01001110);
++ &$movekey (&QWP(16,$key),"xmm3");
++ &lea ($key,&DWP(32,$key));
++ &jmp (&label("key_192b_warm"));
++
++&set_label("14rounds",16);
++ &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
++ &mov ($rounds,13);
++ &lea ($key,&DWP(16,$key));
++ &$movekey (&QWP(-32,$key),"xmm0"); # round 0
++ &$movekey (&QWP(-16,$key),"xmm2"); # round 1
++ &aeskeygenassist("xmm1","xmm2",0x01); # round 2
++ &call (&label("key_256a_cold"));
++ &aeskeygenassist("xmm1","xmm0",0x01); # round 3
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x02); # round 4
++ &call (&label("key_256a"));
++ &aeskeygenassist("xmm1","xmm0",0x02); # round 5
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x04); # round 6
++ &call (&label("key_256a"));
++ &aeskeygenassist("xmm1","xmm0",0x04); # round 7
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x08); # round 8
++ &call (&label("key_256a"));
++ &aeskeygenassist("xmm1","xmm0",0x08); # round 9
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x10); # round 10
++ &call (&label("key_256a"));
++ &aeskeygenassist("xmm1","xmm0",0x10); # round 11
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x20); # round 12
++ &call (&label("key_256a"));
++ &aeskeygenassist("xmm1","xmm0",0x20); # round 13
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x40); # round 14
++ &call (&label("key_256a"));
++ &$movekey (&QWP(0,$key),"xmm0");
++ &mov (&DWP(16,$key),$rounds);
++ &xor ("eax","eax");
++ &ret();
++
++&set_label("key_256a",16);
++ &$movekey (&QWP(0,$key),"xmm2");
++ &lea ($key,&DWP(16,$key));
++&set_label("key_256a_cold");
++ &shufps ("xmm4","xmm0",0b00010000);
++ &pxor ("xmm0","xmm4");
++ &shufps ("xmm4","xmm0",0b10001100);
++ &pxor ("xmm0","xmm4");
++ &pshufd ("xmm1","xmm1",0b11111111); # critical path
++ &pxor ("xmm0","xmm1");
++ &ret();
++
++&set_label("key_256b",16);
++ &$movekey (&QWP(0,$key),"xmm0");
++ &lea ($key,&DWP(16,$key));
++
++ &shufps ("xmm4","xmm2",0b00010000);
++ &pxor ("xmm2","xmm4");
++ &shufps ("xmm4","xmm2",0b10001100);
++ &pxor ("xmm2","xmm4");
++ &pshufd ("xmm1","xmm1",0b10101010); # critical path
++ &pxor ("xmm2","xmm1");
++ &ret();
++
++&set_label("bad_pointer",4);
++ &mov ("eax",-1);
++ &ret ();
++&set_label("bad_keybits",4);
++ &mov ("eax",-2);
++ &ret ();
++&function_end_B("_aesni_set_encrypt_key");
++
++# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
++# AES_KEY *key)
++&function_begin_B("${PREFIX}_set_encrypt_key");
++ &mov ("eax",&wparam(0));
++ &mov ($rounds,&wparam(1));
++ &mov ($key,&wparam(2));
++ &call ("_aesni_set_encrypt_key");
++ &ret ();
++&function_end_B("${PREFIX}_set_encrypt_key");
++
++# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
++# AES_KEY *key)
++&function_begin_B("${PREFIX}_set_decrypt_key");
++ &mov ("eax",&wparam(0));
++ &mov ($rounds,&wparam(1));
++ &mov ($key,&wparam(2));
++ &call ("_aesni_set_encrypt_key");
++ &mov ($key,&wparam(2));
++ &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
++ &test ("eax","eax");
++ &jnz (&label("dec_key_ret"));
++ &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
++
++ &$movekey ("xmm0",&QWP(0,$key)); # just swap
++ &$movekey ("xmm1",&QWP(0,"eax"));
++ &$movekey (&QWP(0,"eax"),"xmm0");
++ &$movekey (&QWP(0,$key),"xmm1");
++ &lea ($key,&DWP(16,$key));
++ &lea ("eax",&DWP(-16,"eax"));
++
++&set_label("dec_key_inverse");
++ &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
++ &$movekey ("xmm1",&QWP(0,"eax"));
++ &aesimc ("xmm0","xmm0");
++ &aesimc ("xmm1","xmm1");
++ &lea ($key,&DWP(16,$key));
++ &lea ("eax",&DWP(-16,"eax"));
++ &cmp ("eax",$key);
++ &$movekey (&QWP(16,"eax"),"xmm0");
++ &$movekey (&QWP(-16,$key),"xmm1");
++ &ja (&label("dec_key_inverse"));
++
++ &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
++ &aesimc ("xmm0","xmm0");
++ &$movekey (&QWP(0,$key),"xmm0");
++
++ &xor ("eax","eax"); # return success
++&set_label("dec_key_ret");
++ &ret ();
++&function_end_B("${PREFIX}_set_decrypt_key");
++&asciz("AES for Intel AES-NI, CRYPTOGAMS by ");
++
++&asm_finish();
+diff -up openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86_64.pl.aesni openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86_64.pl
+--- openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86_64.pl.aesni 2010-01-12 22:18:06.000000000 +0100
++++ openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86_64.pl 2010-01-12 22:18:06.000000000 +0100
+@@ -0,0 +1,991 @@
++#!/usr/bin/env perl
++#
++# ====================================================================
++# Written by Andy Polyakov for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++#
++# This module implements support for Intel AES-NI extension. In
++# OpenSSL context it's used with Intel engine, but can also be used as
++# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
++# details].
++
++$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
++ # generates drop-in replacement for
++ # crypto/aes/asm/aes-x86_64.pl:-)
++
++$flavour = shift; # 1st argument: assembler flavour, or the output file if it contains a dot
++$output = shift;
++if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
++
++$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
++die "can't locate x86_64-xlate.pl";
++
++open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!"; # everything printed from here on is piped through the translator; fail loudly if it cannot be started
++
++$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
++@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
++ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
++
++$code=".text\n";
++
++$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
++# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
++$inp="%rdi";
++$out="%rsi";
++$len="%rdx";
++$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
++$ivp="%r8"; # cbc
++
++$rnds_="%r10d"; # backup copy for $rounds
++$key_="%r11"; # backup copy for $key
++
++# %xmm register layout
++$inout0="%xmm0"; $inout1="%xmm1";
++$inout2="%xmm2"; $inout3="%xmm3";
++$rndkey0="%xmm4"; $rndkey1="%xmm5";
++
++$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt
++$in1="%xmm8"; $in2="%xmm9";
++
++# Inline version of internal aesni_[en|de]crypt1.
++#
++# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
++# cycles which take care of loop variables...
++{ my $sn; # expansion counter private to aesni_generate1; keeps each emitted loop label unique
++sub aesni_generate1 {
++my ($p,$key,$rounds)=@_; # $p: "enc" or "dec"; $key/$rounds: register names, both modified by the emitted code
++++$sn;
++$code.=<<___;
++ $movkey ($key),$rndkey0
++ $movkey 16($key),$rndkey1
++ lea 32($key),$key
++ pxor $rndkey0,$inout0
++.Loop_${p}1_$sn:
++ aes${p} $rndkey1,$inout0
++ dec $rounds
++ $movkey ($key),$rndkey1
++ lea 16($key),$key
++ jnz .Loop_${p}1_$sn # loop body is 16 bytes
++ aes${p}last $rndkey1,$inout0
++___
++}}
++# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
++#
++{ my ($inp,$out,$key) = @_4args; # shadow the globals with the first three ABI argument registers (Win64 or Unix order)
++
++$code.=<<___;
++.globl ${PREFIX}_encrypt
++.type ${PREFIX}_encrypt,\@abi-omnipotent
++.align 16
++${PREFIX}_encrypt:
++ movups ($inp),$inout0 # load input
++ mov 240($key),$rounds # pull $rounds
++___
++ &aesni_generate1("enc",$key,$rounds); # append the single-block encrypt loop to $code
++$code.=<<___;
++ movups $inout0,($out) # output
++ ret
++.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
++
++.globl ${PREFIX}_decrypt
++.type ${PREFIX}_decrypt,\@abi-omnipotent
++.align 16
++${PREFIX}_decrypt:
++ movups ($inp),$inout0 # load input
++ mov 240($key),$rounds # pull $rounds
++___
++ &aesni_generate1("dec",$key,$rounds); # append the single-block decrypt loop to $code
++$code.=<<___;
++ movups $inout0,($out) # output
++ ret
++.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
++___
++}
++
++# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
++# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
++# latency is 6, it turned out that it can be scheduled only every
++# *second* cycle. Thus 3x interleave is the one providing optimal
++# utilization, i.e. when subroutine's throughput is virtually same as
++# of non-interleaved subroutine [for number of input blocks up to 3].
++# This is why it makes no sense to implement 2x subroutine. As soon
++# as/if Intel improves throughput by making it possible to schedule
++# the instructions in question *every* cycle I would have to
++# implement 6x interleave and use it in loop...
++sub aesni_generate3 { ++my $dir=shift; ++# As already mentioned it takes in $key and $rounds, which are *not* ++# preserved. $inout[0-2] is cipher/clear text... ++$code.=<<___; ++.type _aesni_${dir}rypt3,\@abi-omnipotent ++.align 16 ++_aesni_${dir}rypt3: ++ $movkey ($key),$rndkey0 ++ shr \$1,$rounds ++ $movkey 16($key),$rndkey1 ++ lea 32($key),$key ++ pxor $rndkey0,$inout0 ++ pxor $rndkey0,$inout1 ++ pxor $rndkey0,$inout2 ++ ++.L${dir}_loop3: ++ aes${dir} $rndkey1,$inout0 ++ $movkey ($key),$rndkey0 ++ aes${dir} $rndkey1,$inout1 ++ dec $rounds ++ aes${dir} $rndkey1,$inout2 ++ aes${dir} $rndkey0,$inout0 ++ $movkey 16($key),$rndkey1 ++ aes${dir} $rndkey0,$inout1 ++ lea 32($key),$key ++ aes${dir} $rndkey0,$inout2 ++ jnz .L${dir}_loop3 ++ ++ aes${dir} $rndkey1,$inout0 ++ $movkey ($key),$rndkey0 ++ aes${dir} $rndkey1,$inout1 ++ aes${dir} $rndkey1,$inout2 ++ aes${dir}last $rndkey0,$inout0 ++ aes${dir}last $rndkey0,$inout1 ++ aes${dir}last $rndkey0,$inout2 ++ ret ++.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 ++___ ++} ++# 4x interleave is implemented to improve small block performance, ++# most notably [and naturally] 4 block by ~30%. One can argue that one ++# should have implemented 5x as well, but improvement would be <20%, ++# so it's not worth it... ++sub aesni_generate4 { ++my $dir=shift; ++# As already mentioned it takes in $key and $rounds, which are *not* ++# preserved. $inout[0-3] is cipher/clear text... 
++$code.=<<___; ++.type _aesni_${dir}rypt4,\@abi-omnipotent ++.align 16 ++_aesni_${dir}rypt4: ++ $movkey ($key),$rndkey0 ++ shr \$1,$rounds ++ $movkey 16($key),$rndkey1 ++ lea 32($key),$key ++ pxor $rndkey0,$inout0 ++ pxor $rndkey0,$inout1 ++ pxor $rndkey0,$inout2 ++ pxor $rndkey0,$inout3 ++ ++.L${dir}_loop4: ++ aes${dir} $rndkey1,$inout0 ++ $movkey ($key),$rndkey0 ++ aes${dir} $rndkey1,$inout1 ++ dec $rounds ++ aes${dir} $rndkey1,$inout2 ++ aes${dir} $rndkey1,$inout3 ++ aes${dir} $rndkey0,$inout0 ++ $movkey 16($key),$rndkey1 ++ aes${dir} $rndkey0,$inout1 ++ lea 32($key),$key ++ aes${dir} $rndkey0,$inout2 ++ aes${dir} $rndkey0,$inout3 ++ jnz .L${dir}_loop4 ++ ++ aes${dir} $rndkey1,$inout0 ++ $movkey ($key),$rndkey0 ++ aes${dir} $rndkey1,$inout1 ++ aes${dir} $rndkey1,$inout2 ++ aes${dir} $rndkey1,$inout3 ++ aes${dir}last $rndkey0,$inout0 ++ aes${dir}last $rndkey0,$inout1 ++ aes${dir}last $rndkey0,$inout2 ++ aes${dir}last $rndkey0,$inout3 ++ ret ++.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 ++___ ++} ++&aesni_generate3("enc") if ($PREFIX eq "aesni"); ++&aesni_generate3("dec"); ++&aesni_generate4("enc") if ($PREFIX eq "aesni"); ++&aesni_generate4("dec"); ++ ++if ($PREFIX eq "aesni") { ++# void aesni_ecb_encrypt (const void *in, void *out, ++# size_t length, const AES_KEY *key, ++# int enc); ++$code.=<<___; ++.globl aesni_ecb_encrypt ++.type aesni_ecb_encrypt,\@function,5 ++.align 16 ++aesni_ecb_encrypt: ++ cmp \$16,$len # check length ++ jb .Lecb_ret ++ ++ mov 240($key),$rounds # pull $rounds ++ and \$-16,$len ++ mov $key,$key_ # backup $key ++ test %r8d,%r8d # 5th argument ++ mov $rounds,$rnds_ # backup $rounds ++ jz .Lecb_decrypt ++#--------------------------- ECB ENCRYPT ------------------------------# ++ sub \$0x40,$len ++ jbe .Lecb_enc_tail ++ jmp .Lecb_enc_loop3 ++.align 16 ++.Lecb_enc_loop3: ++ movups ($inp),$inout0 ++ movups 0x10($inp),$inout1 ++ movups 0x20($inp),$inout2 ++ call _aesni_encrypt3 ++ sub \$0x30,$len ++ lea 0x30($inp),$inp ++ lea 
0x30($out),$out ++ movups $inout0,-0x30($out) ++ mov $rnds_,$rounds # restore $rounds ++ movups $inout1,-0x20($out) ++ mov $key_,$key # restore $key ++ movups $inout2,-0x10($out) ++ ja .Lecb_enc_loop3 ++ ++.Lecb_enc_tail: ++ add \$0x40,$len ++ jz .Lecb_ret ++ ++ cmp \$0x10,$len ++ movups ($inp),$inout0 ++ je .Lecb_enc_one ++ cmp \$0x20,$len ++ movups 0x10($inp),$inout1 ++ je .Lecb_enc_two ++ cmp \$0x30,$len ++ movups 0x20($inp),$inout2 ++ je .Lecb_enc_three ++ movups 0x30($inp),$inout3 ++ call _aesni_encrypt4 ++ movups $inout0,($out) ++ movups $inout1,0x10($out) ++ movups $inout2,0x20($out) ++ movups $inout3,0x30($out) ++ jmp .Lecb_ret ++.align 16 ++.Lecb_enc_one: ++___ ++ &aesni_generate1("enc",$key,$rounds); ++$code.=<<___; ++ movups $inout0,($out) ++ jmp .Lecb_ret ++.align 16 ++.Lecb_enc_two: ++ call _aesni_encrypt3 ++ movups $inout0,($out) ++ movups $inout1,0x10($out) ++ jmp .Lecb_ret ++.align 16 ++.Lecb_enc_three: ++ call _aesni_encrypt3 ++ movups $inout0,($out) ++ movups $inout1,0x10($out) ++ movups $inout2,0x20($out) ++ jmp .Lecb_ret ++ #--------------------------- ECB DECRYPT ------------------------------# ++.align 16 ++.Lecb_decrypt: ++ sub \$0x40,$len ++ jbe .Lecb_dec_tail ++ jmp .Lecb_dec_loop3 ++.align 16 ++.Lecb_dec_loop3: ++ movups ($inp),$inout0 ++ movups 0x10($inp),$inout1 ++ movups 0x20($inp),$inout2 ++ call _aesni_decrypt3 ++ sub \$0x30,$len ++ lea 0x30($inp),$inp ++ lea 0x30($out),$out ++ movups $inout0,-0x30($out) ++ mov $rnds_,$rounds # restore $rounds ++ movups $inout1,-0x20($out) ++ mov $key_,$key # restore $key ++ movups $inout2,-0x10($out) ++ ja .Lecb_dec_loop3 ++ ++.Lecb_dec_tail: ++ add \$0x40,$len ++ jz .Lecb_ret ++ ++ cmp \$0x10,$len ++ movups ($inp),$inout0 ++ je .Lecb_dec_one ++ cmp \$0x20,$len ++ movups 0x10($inp),$inout1 ++ je .Lecb_dec_two ++ cmp \$0x30,$len ++ movups 0x20($inp),$inout2 ++ je .Lecb_dec_three ++ movups 0x30($inp),$inout3 ++ call _aesni_decrypt4 ++ movups $inout0,($out) ++ movups $inout1,0x10($out) ++ movups 
$inout2,0x20($out) ++ movups $inout3,0x30($out) ++ jmp .Lecb_ret ++.align 16 ++.Lecb_dec_one: ++___ ++ &aesni_generate1("dec",$key,$rounds); ++$code.=<<___; ++ movups $inout0,($out) ++ jmp .Lecb_ret ++.align 16 ++.Lecb_dec_two: ++ call _aesni_decrypt3 ++ movups $inout0,($out) ++ movups $inout1,0x10($out) ++ jmp .Lecb_ret ++.align 16 ++.Lecb_dec_three: ++ call _aesni_decrypt3 ++ movups $inout0,($out) ++ movups $inout1,0x10($out) ++ movups $inout2,0x20($out) ++ ++.Lecb_ret: ++ ret ++.size aesni_ecb_encrypt,.-aesni_ecb_encrypt ++___ ++} ++ ++# void $PREFIX_cbc_encrypt (const void *inp, void *out, ++# size_t length, const AES_KEY *key, ++# unsigned char *ivp,const int enc); ++$reserved = $win64?0x40:-0x18; # used in decrypt ++$code.=<<___; ++.globl ${PREFIX}_cbc_encrypt ++.type ${PREFIX}_cbc_encrypt,\@function,6 ++.align 16 ++${PREFIX}_cbc_encrypt: ++ test $len,$len # check length ++ jz .Lcbc_ret ++ ++ mov 240($key),$rnds_ # pull $rounds ++ mov $key,$key_ # backup $key ++ test %r9d,%r9d # 6th argument ++ jz .Lcbc_decrypt ++#--------------------------- CBC ENCRYPT ------------------------------# ++ movups ($ivp),$inout0 # load iv as initial state ++ cmp \$16,$len ++ mov $rnds_,$rounds ++ jb .Lcbc_enc_tail ++ sub \$16,$len ++ jmp .Lcbc_enc_loop ++.align 16 ++.Lcbc_enc_loop: ++ movups ($inp),$inout1 # load input ++ lea 16($inp),$inp ++ pxor $inout1,$inout0 ++___ ++ &aesni_generate1("enc",$key,$rounds); ++$code.=<<___; ++ sub \$16,$len ++ lea 16($out),$out ++ mov $rnds_,$rounds # restore $rounds ++ mov $key_,$key # restore $key ++ movups $inout0,-16($out) # store output ++ jnc .Lcbc_enc_loop ++ add \$16,$len ++ jnz .Lcbc_enc_tail ++ movups $inout0,($ivp) ++ jmp .Lcbc_ret ++ ++.Lcbc_enc_tail: ++ mov $len,%rcx # zaps $key ++ xchg $inp,$out # $inp is %rsi and $out is %rdi now ++ .long 0x9066A4F3 # rep movsb ++ mov \$16,%ecx # zero tail ++ sub $len,%rcx ++ xor %eax,%eax ++ .long 0x9066AAF3 # rep stosb ++ lea -16(%rdi),%rdi # rewind $out by 1 block ++ mov $rnds_,$rounds # 
restore $rounds ++ mov %rdi,%rsi # $inp and $out are the same ++ mov $key_,$key # restore $key ++ xor $len,$len # len=16 ++ jmp .Lcbc_enc_loop # one more spin ++ #--------------------------- CBC DECRYPT ------------------------------# ++.align 16 ++.Lcbc_decrypt: ++___ ++$code.=<<___ if ($win64); ++ lea -0x58(%rsp),%rsp ++ movaps %xmm6,(%rsp) ++ movaps %xmm7,0x10(%rsp) ++ movaps %xmm8,0x20(%rsp) ++ movaps %xmm9,0x30(%rsp) ++.Lcbc_decrypt_body: ++___ ++$code.=<<___; ++ movups ($ivp),$iv ++ sub \$0x40,$len ++ mov $rnds_,$rounds ++ jbe .Lcbc_dec_tail ++ jmp .Lcbc_dec_loop3 ++.align 16 ++.Lcbc_dec_loop3: ++ movups ($inp),$inout0 ++ movups 0x10($inp),$inout1 ++ movups 0x20($inp),$inout2 ++ movaps $inout0,$in0 ++ movaps $inout1,$in1 ++ movaps $inout2,$in2 ++ call _aesni_decrypt3 ++ sub \$0x30,$len ++ lea 0x30($inp),$inp ++ lea 0x30($out),$out ++ pxor $iv,$inout0 ++ pxor $in0,$inout1 ++ movaps $in2,$iv ++ pxor $in1,$inout2 ++ movups $inout0,-0x30($out) ++ mov $rnds_,$rounds # restore $rounds ++ movups $inout1,-0x20($out) ++ mov $key_,$key # restore $key ++ movups $inout2,-0x10($out) ++ ja .Lcbc_dec_loop3 ++ ++.Lcbc_dec_tail: ++ add \$0x40,$len ++ movups $iv,($ivp) ++ jz .Lcbc_dec_ret ++ ++ movups ($inp),$inout0 ++ cmp \$0x10,$len ++ movaps $inout0,$in0 ++ jbe .Lcbc_dec_one ++ movups 0x10($inp),$inout1 ++ cmp \$0x20,$len ++ movaps $inout1,$in1 ++ jbe .Lcbc_dec_two ++ movups 0x20($inp),$inout2 ++ cmp \$0x30,$len ++ movaps $inout2,$in2 ++ jbe .Lcbc_dec_three ++ movups 0x30($inp),$inout3 ++ call _aesni_decrypt4 ++ pxor $iv,$inout0 ++ movups 0x30($inp),$iv ++ pxor $in0,$inout1 ++ movups $inout0,($out) ++ pxor $in1,$inout2 ++ movups $inout1,0x10($out) ++ pxor $in2,$inout3 ++ movups $inout2,0x20($out) ++ movaps $inout3,$inout0 ++ lea 0x30($out),$out ++ jmp .Lcbc_dec_tail_collected ++.align 16 ++.Lcbc_dec_one: ++___ ++ &aesni_generate1("dec",$key,$rounds); ++$code.=<<___; ++ pxor $iv,$inout0 ++ movaps $in0,$iv ++ jmp .Lcbc_dec_tail_collected ++.align 16 ++.Lcbc_dec_two: ++ call 
_aesni_decrypt3 ++ pxor $iv,$inout0 ++ pxor $in0,$inout1 ++ movups $inout0,($out) ++ movaps $in1,$iv ++ movaps $inout1,$inout0 ++ lea 0x10($out),$out ++ jmp .Lcbc_dec_tail_collected ++.align 16 ++.Lcbc_dec_three: ++ call _aesni_decrypt3 ++ pxor $iv,$inout0 ++ pxor $in0,$inout1 ++ movups $inout0,($out) ++ pxor $in1,$inout2 ++ movups $inout1,0x10($out) ++ movaps $in2,$iv ++ movaps $inout2,$inout0 ++ lea 0x20($out),$out ++ jmp .Lcbc_dec_tail_collected ++.align 16 ++.Lcbc_dec_tail_collected: ++ and \$15,$len ++ movups $iv,($ivp) ++ jnz .Lcbc_dec_tail_partial ++ movups $inout0,($out) ++ jmp .Lcbc_dec_ret ++.Lcbc_dec_tail_partial: ++ movaps $inout0,$reserved(%rsp) ++ mov $out,%rdi ++ mov $len,%rcx ++ lea $reserved(%rsp),%rsi ++ .long 0x9066A4F3 # rep movsb ++ ++.Lcbc_dec_ret: ++___ ++$code.=<<___ if ($win64); ++ movaps (%rsp),%xmm6 ++ movaps 0x10(%rsp),%xmm7 ++ movaps 0x20(%rsp),%xmm8 ++ movaps 0x30(%rsp),%xmm9 ++ lea 0x58(%rsp),%rsp ++___ ++$code.=<<___; ++.Lcbc_ret: ++ ret ++.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt ++___ ++ ++# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, ++# int bits, AES_KEY *key) ++{ my ($inp,$bits,$key) = @_4args; ++ $bits =~ s/%r/%e/; ++ ++$code.=<<___; ++.globl ${PREFIX}_set_decrypt_key ++.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent ++.align 16 ++${PREFIX}_set_decrypt_key: ++ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 ++ call _aesni_set_encrypt_key ++ shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key ++ test %eax,%eax ++ jnz .Ldec_key_ret ++ lea 16($key,$bits),$inp # points at the end of key schedule ++ ++ $movkey ($key),%xmm0 # just swap ++ $movkey ($inp),%xmm1 ++ $movkey %xmm0,($inp) ++ $movkey %xmm1,($key) ++ lea 16($key),$key ++ lea -16($inp),$inp ++ ++.Ldec_key_inverse: ++ $movkey ($key),%xmm0 # swap and inverse ++ $movkey ($inp),%xmm1 ++ aesimc %xmm0,%xmm0 ++ aesimc %xmm1,%xmm1 ++ lea 16($key),$key ++ lea -16($inp),$inp ++ cmp $key,$inp ++ $movkey %xmm0,16($inp) ++ $movkey %xmm1,-16($key) ++ ja 
.Ldec_key_inverse ++ ++ $movkey ($key),%xmm0 # inverse middle ++ aesimc %xmm0,%xmm0 ++ $movkey %xmm0,($inp) ++.Ldec_key_ret: ++ add \$8,%rsp ++ ret ++.LSEH_end_set_decrypt_key: ++.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key ++___ ++ ++# This is based on submission by ++# ++# Huang Ying ++# Vinodh Gopal ++# Kahraman Akdemir ++# ++# Agressively optimized in respect to aeskeygenassist's critical path ++# and is contained in %xmm0-5 to meet Win64 ABI requirement. ++# ++$code.=<<___; ++.globl ${PREFIX}_set_encrypt_key ++.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent ++.align 16 ++${PREFIX}_set_encrypt_key: ++_aesni_set_encrypt_key: ++ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 ++ test $inp,$inp ++ mov \$-1,%rax ++ jz .Lenc_key_ret ++ test $key,$key ++ jz .Lenc_key_ret ++ ++ movups ($inp),%xmm0 # pull first 128 bits of *userKey ++ pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0 ++ lea 16($key),%rax ++ cmp \$256,$bits ++ je .L14rounds ++ cmp \$192,$bits ++ je .L12rounds ++ cmp \$128,$bits ++ jne .Lbad_keybits ++ ++.L10rounds: ++ mov \$9,$bits # 10 rounds for 128-bit key ++ $movkey %xmm0,($key) # round 0 ++ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 ++ call .Lkey_expansion_128_cold ++ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 ++ call .Lkey_expansion_128 ++ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 ++ call .Lkey_expansion_128 ++ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 ++ call .Lkey_expansion_128 ++ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 ++ call .Lkey_expansion_128 ++ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 ++ call .Lkey_expansion_128 ++ aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 ++ call .Lkey_expansion_128 ++ aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 ++ call .Lkey_expansion_128 ++ aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 ++ call .Lkey_expansion_128 ++ aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 ++ call .Lkey_expansion_128 ++ $movkey %xmm0,(%rax) ++ mov $bits,80(%rax) # 240(%rdx) ++ xor %eax,%eax ++ jmp .Lenc_key_ret ++ ++.align 
16 ++.L12rounds: ++ movq 16($inp),%xmm2 # remaining 1/3 of *userKey ++ mov \$11,$bits # 12 rounds for 192 ++ $movkey %xmm0,($key) # round 0 ++ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 ++ call .Lkey_expansion_192a_cold ++ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 ++ call .Lkey_expansion_192b ++ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 ++ call .Lkey_expansion_192a ++ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 ++ call .Lkey_expansion_192b ++ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 ++ call .Lkey_expansion_192a ++ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 ++ call .Lkey_expansion_192b ++ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 ++ call .Lkey_expansion_192a ++ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 ++ call .Lkey_expansion_192b ++ $movkey %xmm0,(%rax) ++ mov $bits,48(%rax) # 240(%rdx) ++ xor %rax, %rax ++ jmp .Lenc_key_ret ++ ++.align 16 ++.L14rounds: ++ movups 16($inp),%xmm2 # remaning half of *userKey ++ mov \$13,$bits # 14 rounds for 256 ++ lea 16(%rax),%rax ++ $movkey %xmm0,($key) # round 0 ++ $movkey %xmm2,16($key) # round 1 ++ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 ++ call .Lkey_expansion_256a_cold ++ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 ++ call .Lkey_expansion_256b ++ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 ++ call .Lkey_expansion_256a ++ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 ++ call .Lkey_expansion_256b ++ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 ++ call .Lkey_expansion_256a ++ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 ++ call .Lkey_expansion_256b ++ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 ++ call .Lkey_expansion_256a ++ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 ++ call .Lkey_expansion_256b ++ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 ++ call .Lkey_expansion_256a ++ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 ++ call .Lkey_expansion_256b ++ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 ++ call .Lkey_expansion_256a ++ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 ++ call 
.Lkey_expansion_256b ++ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 ++ call .Lkey_expansion_256a ++ $movkey %xmm0,(%rax) ++ mov $bits,16(%rax) # 240(%rdx) ++ xor %rax,%rax ++ jmp .Lenc_key_ret ++ ++.align 16 ++.Lbad_keybits: ++ mov \$-2,%rax ++.Lenc_key_ret: ++ add \$8,%rsp ++ ret ++.LSEH_end_set_encrypt_key: ++ ++.align 16 ++.Lkey_expansion_128: ++ $movkey %xmm0,(%rax) ++ lea 16(%rax),%rax ++.Lkey_expansion_128_cold: ++ shufps \$0b00010000,%xmm0,%xmm4 ++ pxor %xmm4, %xmm0 ++ shufps \$0b10001100,%xmm0,%xmm4 ++ pxor %xmm4, %xmm0 ++ pshufd \$0b11111111,%xmm1,%xmm1 # critical path ++ pxor %xmm1,%xmm0 ++ ret ++ ++.align 16 ++.Lkey_expansion_192a: ++ $movkey %xmm0,(%rax) ++ lea 16(%rax),%rax ++.Lkey_expansion_192a_cold: ++ movaps %xmm2, %xmm5 ++.Lkey_expansion_192b_warm: ++ shufps \$0b00010000,%xmm0,%xmm4 ++ movaps %xmm2,%xmm3 ++ pxor %xmm4,%xmm0 ++ shufps \$0b10001100,%xmm0,%xmm4 ++ pslldq \$4,%xmm3 ++ pxor %xmm4,%xmm0 ++ pshufd \$0b01010101,%xmm1,%xmm1 # critical path ++ pxor %xmm3,%xmm2 ++ pxor %xmm1,%xmm0 ++ pshufd \$0b11111111,%xmm0,%xmm3 ++ pxor %xmm3,%xmm2 ++ ret ++ ++.align 16 ++.Lkey_expansion_192b: ++ movaps %xmm0,%xmm3 ++ shufps \$0b01000100,%xmm0,%xmm5 ++ $movkey %xmm5,(%rax) ++ shufps \$0b01001110,%xmm2,%xmm3 ++ $movkey %xmm3,16(%rax) ++ lea 32(%rax),%rax ++ jmp .Lkey_expansion_192b_warm ++ ++.align 16 ++.Lkey_expansion_256a: ++ $movkey %xmm2,(%rax) ++ lea 16(%rax),%rax ++.Lkey_expansion_256a_cold: ++ shufps \$0b00010000,%xmm0,%xmm4 ++ pxor %xmm4,%xmm0 ++ shufps \$0b10001100,%xmm0,%xmm4 ++ pxor %xmm4,%xmm0 ++ pshufd \$0b11111111,%xmm1,%xmm1 # critical path ++ pxor %xmm1,%xmm0 ++ ret ++ ++.align 16 ++.Lkey_expansion_256b: ++ $movkey %xmm0,(%rax) ++ lea 16(%rax),%rax ++ ++ shufps \$0b00010000,%xmm2,%xmm4 ++ pxor %xmm4,%xmm2 ++ shufps \$0b10001100,%xmm2,%xmm4 ++ pxor %xmm4,%xmm2 ++ pshufd \$0b10101010,%xmm1,%xmm1 # critical path ++ pxor %xmm1,%xmm2 ++ ret ++.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key ++___ ++} ++ ++$code.=<<___; ++.asciz "AES 
for Intel AES-NI, CRYPTOGAMS by " ++.align 64 ++___ ++ ++# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, ++# CONTEXT *context,DISPATCHER_CONTEXT *disp) ++if ($win64) { ++$rec="%rcx"; ++$frame="%rdx"; ++$context="%r8"; ++$disp="%r9"; ++ ++$code.=<<___; ++.extern __imp_RtlVirtualUnwind ++.type cbc_se_handler,\@abi-omnipotent ++.align 16 ++cbc_se_handler: ++ push %rsi ++ push %rdi ++ push %rbx ++ push %rbp ++ push %r12 ++ push %r13 ++ push %r14 ++ push %r15 ++ pushfq ++ sub \$64,%rsp ++ ++ mov 152($context),%rax # pull context->Rsp ++ mov 248($context),%rbx # pull context->Rip ++ ++ lea .Lcbc_decrypt(%rip),%r10 ++ cmp %r10,%rbx # context->Rip<"prologue" label ++ jb .Lin_prologue ++ ++ lea .Lcbc_decrypt_body(%rip),%r10 ++ cmp %r10,%rbx # context->RipRip>="epilogue" label ++ jae .Lin_prologue ++ ++ lea 0(%rax),%rsi # top of stack ++ lea 512($context),%rdi # &context.Xmm6 ++ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) ++ .long 0xa548f3fc # cld; rep movsq ++ lea 0x58(%rax),%rax # adjust stack pointer ++ jmp .Lin_prologue ++ ++.Lrestore_rax: ++ mov 120($context),%rax ++.Lin_prologue: ++ mov 8(%rax),%rdi ++ mov 16(%rax),%rsi ++ mov %rax,152($context) # restore context->Rsp ++ mov %rsi,168($context) # restore context->Rsi ++ mov %rdi,176($context) # restore context->Rdi ++ ++ jmp .Lcommon_seh_exit ++.size cbc_se_handler,.-cbc_se_handler ++ ++.type ecb_se_handler,\@abi-omnipotent ++.align 16 ++ecb_se_handler: ++ push %rsi ++ push %rdi ++ push %rbx ++ push %rbp ++ push %r12 ++ push %r13 ++ push %r14 ++ push %r15 ++ pushfq ++ sub \$64,%rsp ++ ++ mov 152($context),%rax # pull context->Rsp ++ mov 8(%rax),%rdi ++ mov 16(%rax),%rsi ++ mov %rsi,168($context) # restore context->Rsi ++ mov %rdi,176($context) # restore context->Rdi ++ ++.Lcommon_seh_exit: ++ ++ mov 40($disp),%rdi # disp->ContextRecord ++ mov $context,%rsi # context ++ mov \$154,%ecx # sizeof(CONTEXT) ++ .long 0xa548f3fc # cld; rep movsq ++ ++ mov $disp,%rsi ++ xor %rcx,%rcx # arg1, 
UNW_FLAG_NHANDLER ++ mov 8(%rsi),%rdx # arg2, disp->ImageBase ++ mov 0(%rsi),%r8 # arg3, disp->ControlPc ++ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry ++ mov 40(%rsi),%r10 # disp->ContextRecord ++ lea 56(%rsi),%r11 # &disp->HandlerData ++ lea 24(%rsi),%r12 # &disp->EstablisherFrame ++ mov %r10,32(%rsp) # arg5 ++ mov %r11,40(%rsp) # arg6 ++ mov %r12,48(%rsp) # arg7 ++ mov %rcx,56(%rsp) # arg8, (NULL) ++ call *__imp_RtlVirtualUnwind(%rip) ++ ++ mov \$1,%eax # ExceptionContinueSearch ++ add \$64,%rsp ++ popfq ++ pop %r15 ++ pop %r14 ++ pop %r13 ++ pop %r12 ++ pop %rbp ++ pop %rbx ++ pop %rdi ++ pop %rsi ++ ret ++.size cbc_se_handler,.-cbc_se_handler ++ ++.section .pdata ++.align 4 ++ .rva .LSEH_begin_${PREFIX}_ecb_encrypt ++ .rva .LSEH_end_${PREFIX}_ecb_encrypt ++ .rva .LSEH_info_ecb ++ ++ .rva .LSEH_begin_${PREFIX}_cbc_encrypt ++ .rva .LSEH_end_${PREFIX}_cbc_encrypt ++ .rva .LSEH_info_cbc ++ ++ .rva ${PREFIX}_set_decrypt_key ++ .rva .LSEH_end_set_decrypt_key ++ .rva .LSEH_info_key ++ ++ .rva ${PREFIX}_set_encrypt_key ++ .rva .LSEH_end_set_encrypt_key ++ .rva .LSEH_info_key ++.section .xdata ++.align 8 ++.LSEH_info_ecb: ++ .byte 9,0,0,0 ++ .rva ecb_se_handler ++.LSEH_info_cbc: ++ .byte 9,0,0,0 ++ .rva cbc_se_handler ++.LSEH_info_key: ++ .byte 0x01,0x04,0x01,0x00 ++ .byte 0x04,0x02,0x00,0x00 ++___ ++} ++ ++sub rex { ++ local *opcode=shift; ++ my ($dst,$src)=@_; ++ ++ if ($dst>=8 || $src>=8) { ++ $rex=0x40; ++ $rex|=0x04 if($dst>=8); ++ $rex|=0x01 if($src>=8); ++ push @opcode,$rex; ++ } ++} ++ ++sub aesni { ++ my $line=shift; ++ my @opcode=(0x66); ++ ++ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { ++ rex(\@opcode,$4,$3); ++ push @opcode,0x0f,0x3a,0xdf; ++ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M ++ my $c=$2; ++ push @opcode,$c=~/^0/?oct($c):$c; ++ return ".byte\t".join(',',@opcode); ++ } ++ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { ++ my %opcodelet = ( ++ "aesimc" => 0xdb, ++ "aesenc" => 0xdc, 
"aesenclast" => 0xdd, ++ "aesdec" => 0xde, "aesdeclast" => 0xdf ++ ); ++ return undef if (!defined($opcodelet{$1})); ++ rex(\@opcode,$3,$2); ++ push @opcode,0x0f,0x38,$opcodelet{$1}; ++ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M ++ return ".byte\t".join(',',@opcode); ++ } ++ return $line; ++} ++ ++$code =~ s/\`([^\`]*)\`/eval($1)/gem; ++$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; ++ ++print $code; ++ ++close STDOUT; +diff -up openssl-1.0.0-beta4/crypto/aes/Makefile.aesni openssl-1.0.0-beta4/crypto/aes/Makefile +--- openssl-1.0.0-beta4/crypto/aes/Makefile.aesni 2008-12-23 12:33:00.000000000 +0100 ++++ openssl-1.0.0-beta4/crypto/aes/Makefile 2010-01-12 22:18:06.000000000 +0100 +@@ -50,9 +50,13 @@ aes-ia64.s: asm/aes-ia64.S + + aes-586.s: asm/aes-586.pl ../perlasm/x86asm.pl + $(PERL) asm/aes-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ ++aesni-x86.s: asm/aesni-x86.pl ../perlasm/x86asm.pl ++ $(PERL) asm/aesni-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ + + aes-x86_64.s: asm/aes-x86_64.pl + $(PERL) asm/aes-x86_64.pl $(PERLASM_SCHEME) > $@ ++aesni-x86_64.s: asm/aesni-x86_64.pl ++ $(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@ + + aes-sparcv9.s: asm/aes-sparcv9.pl + $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@ +diff -up openssl-1.0.0-beta4/crypto/engine/eng_aesni.c.aesni openssl-1.0.0-beta4/crypto/engine/eng_aesni.c +--- openssl-1.0.0-beta4/crypto/engine/eng_aesni.c.aesni 2010-01-12 22:18:06.000000000 +0100 ++++ openssl-1.0.0-beta4/crypto/engine/eng_aesni.c 2010-01-12 22:18:06.000000000 +0100 +@@ -0,0 +1,413 @@ ++/* ++ * Support for Intel AES-NI intruction set ++ * Author: Huang Ying ++ * ++ * Intel AES-NI is a new set of Single Instruction Multiple Data ++ * (SIMD) instructions that are going to be introduced in the next ++ * generation of Intel processor, as of 2009. These instructions ++ * enable fast and secure data encryption and decryption, using the ++ * Advanced Encryption Standard (AES), defined by FIPS Publication ++ * number 197. 
The architecture introduces six instructions that ++ * offer full hardware support for AES. Four of them support high ++ * performance data encryption and decryption, and the other two ++ * instructions support the AES key expansion procedure. ++ * ++ * The white paper can be downloaded from: ++ * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf ++ * ++ * This file is based on engines/e_padlock.c ++ */ ++ ++/* ==================================================================== ++ * Copyright (c) 1999-2001 The OpenSSL Project. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * ++ * 3. All advertising materials mentioning features or use of this ++ * software must display the following acknowledgment: ++ * "This product includes software developed by the OpenSSL Project ++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" ++ * ++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to ++ * endorse or promote products derived from this software without ++ * prior written permission. For written permission, please contact ++ * licensing@OpenSSL.org. ++ * ++ * 5. Products derived from this software may not be called "OpenSSL" ++ * nor may "OpenSSL" appear in their names without prior written ++ * permission of the OpenSSL Project. ++ * ++ * 6. 
Redistributions of any form whatsoever must retain the following ++ * acknowledgment: ++ * "This product includes software developed by the OpenSSL Project ++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY ++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR ++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT ++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, ++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED ++ * OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ==================================================================== ++ * ++ * This product includes cryptographic software written by Eric Young ++ * (eay@cryptsoft.com). This product includes software written by Tim ++ * Hudson (tjh@cryptsoft.com). ++ * ++ */ ++ ++ ++#include ++ ++#if !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AES_NI) && !defined(OPENSSL_NO_AES) ++ ++#include ++#include "cryptlib.h" ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* AES-NI is available *ONLY* on some x86 CPUs. Not only that it ++ doesn't exist elsewhere, but it even can't be compiled on other ++ platforms! */ ++#undef COMPILE_HW_AESNI ++#if (defined(__x86_64) || defined(__x86_64__) || \ ++ defined(_M_AMD64) || defined(_M_X64) || \ ++ defined(OPENSSL_IA32_SSE2)) && !defined(OPENSSL_NO_ASM) ++#define COMPILE_HW_AESNI ++static ENGINE *ENGINE_aesni (void); ++#endif ++ ++void ENGINE_load_aesni (void) ++{ ++/* On non-x86 CPUs it just returns. 
*/ ++#ifdef COMPILE_HW_AESNI ++ ENGINE *toadd = ENGINE_aesni(); ++ if (!toadd) ++ return; ++ ENGINE_add (toadd); ++ ENGINE_register_complete (toadd); ++ ENGINE_free (toadd); ++ ERR_clear_error (); ++#endif ++} ++ ++#ifdef COMPILE_HW_AESNI ++int aesni_set_encrypt_key(const unsigned char *userKey, int bits, ++ AES_KEY *key); ++int aesni_set_decrypt_key(const unsigned char *userKey, int bits, ++ AES_KEY *key); ++ ++void aesni_encrypt(const unsigned char *in, unsigned char *out, ++ const AES_KEY *key); ++void aesni_decrypt(const unsigned char *in, unsigned char *out, ++ const AES_KEY *key); ++ ++void aesni_ecb_encrypt(const unsigned char *in, ++ unsigned char *out, ++ size_t length, ++ const AES_KEY *key, ++ int enc); ++void aesni_cbc_encrypt(const unsigned char *in, ++ unsigned char *out, ++ size_t length, ++ const AES_KEY *key, ++ unsigned char *ivec, int enc); ++ ++/* Function for ENGINE detection and control */ ++static int aesni_init(ENGINE *e); ++ ++/* Cipher Stuff */ ++static int aesni_ciphers(ENGINE *e, const EVP_CIPHER **cipher, ++ const int **nids, int nid); ++ ++#define AESNI_MIN_ALIGN 16 ++#define AESNI_ALIGN(x) \ ++ ((void *)(((unsigned long)(x)+AESNI_MIN_ALIGN-1)&~(AESNI_MIN_ALIGN-1))) ++ ++/* Engine names */ ++static const char aesni_id[] = "aesni", ++ aesni_name[] = "Intel AES-NI engine", ++ no_aesni_name[] = "Intel AES-NI engine (no-aesni)"; ++ ++/* ===== Engine "management" functions ===== */ ++ ++#if defined(_WIN32) ++typedef unsigned __int64 IA32CAP; ++#else ++typedef unsigned long long IA32CAP; ++#endif ++ ++/* Prepare the ENGINE structure for registration */ ++static int ++aesni_bind_helper(ENGINE *e) ++{ ++ int engage; ++ if (sizeof(OPENSSL_ia32cap_P) > 4) { ++ engage = (OPENSSL_ia32cap_P >> 57) & 1; ++ } else { ++ IA32CAP OPENSSL_ia32_cpuid(void); ++ engage = (OPENSSL_ia32_cpuid() >> 57) & 1; ++ } ++ ++ /* Register everything or return with an error */ ++ if (!ENGINE_set_id(e, aesni_id) || ++ !ENGINE_set_name(e, engage ? 
aesni_name : no_aesni_name) || ++ ++ !ENGINE_set_init_function(e, aesni_init) || ++ (engage && !ENGINE_set_ciphers (e, aesni_ciphers)) ++ ) ++ return 0; ++ ++ /* Everything looks good */ ++ return 1; ++} ++ ++/* Constructor */ ++static ENGINE * ++ENGINE_aesni(void) ++{ ++ ENGINE *eng = ENGINE_new(); ++ ++ if (!eng) { ++ return NULL; ++ } ++ ++ if (!aesni_bind_helper(eng)) { ++ ENGINE_free(eng); ++ return NULL; ++ } ++ ++ return eng; ++} ++ ++/* Check availability of the engine */ ++static int ++aesni_init(ENGINE *e) ++{ ++ return 1; ++} ++ ++#if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb) ++#define NID_aes_128_cfb NID_aes_128_cfb128 ++#endif ++ ++#if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb) ++#define NID_aes_128_ofb NID_aes_128_ofb128 ++#endif ++ ++#if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb) ++#define NID_aes_192_cfb NID_aes_192_cfb128 ++#endif ++ ++#if defined(NID_aes_192_ofb128) && ! defined (NID_aes_192_ofb) ++#define NID_aes_192_ofb NID_aes_192_ofb128 ++#endif ++ ++#if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb) ++#define NID_aes_256_cfb NID_aes_256_cfb128 ++#endif ++ ++#if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb) ++#define NID_aes_256_ofb NID_aes_256_ofb128 ++#endif ++ ++/* List of supported ciphers. 
*/ ++static int aesni_cipher_nids[] = { ++ NID_aes_128_ecb, ++ NID_aes_128_cbc, ++ NID_aes_128_cfb, ++ NID_aes_128_ofb, ++ ++ NID_aes_192_ecb, ++ NID_aes_192_cbc, ++ NID_aes_192_cfb, ++ NID_aes_192_ofb, ++ ++ NID_aes_256_ecb, ++ NID_aes_256_cbc, ++ NID_aes_256_cfb, ++ NID_aes_256_ofb, ++}; ++static int aesni_cipher_nids_num = ++ (sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0])); ++ ++typedef struct ++{ ++ AES_KEY ks; ++ unsigned int _pad1[3]; ++} AESNI_KEY; ++ ++static int ++aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *user_key, ++ const unsigned char *iv, int enc) ++{ ++ int ret; ++ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); ++ ++ if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE ++ || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE ++ || enc) ++ ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key); ++ else ++ ret=aesni_set_decrypt_key(user_key, ctx->key_len * 8, key); ++ ++ if(ret < 0) { ++ EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED); ++ return 0; ++ } ++ ++ return 1; ++} ++ ++static int aesni_cipher_ecb(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t inl) ++{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); ++ aesni_ecb_encrypt(in, out, inl, key, ctx->encrypt); ++ return 1; ++} ++static int aesni_cipher_cbc(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t inl) ++{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); ++ aesni_cbc_encrypt(in, out, inl, key, ++ ctx->iv, ctx->encrypt); ++ return 1; ++} ++static int aesni_cipher_cfb(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t inl) ++{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); ++ CRYPTO_cfb128_encrypt(in, out, inl, key, ctx->iv, ++ &ctx->num, ctx->encrypt, ++ (block128_f)aesni_encrypt); ++ return 1; ++} ++static int aesni_cipher_ofb(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t inl) ++{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); ++ 
CRYPTO_ofb128_encrypt(in, out, inl, key, ctx->iv, ++ &ctx->num, (block128_f)aesni_encrypt); ++ return 1; ++} ++ ++#define AES_BLOCK_SIZE 16 ++/* Block size per mode, picked by DECLARE_AES_EVP through token pasting on umode: ECB and CBC use AES_BLOCK_SIZE, OFB and CFB are declared with block size 1. */ ++#define EVP_CIPHER_block_size_ECB AES_BLOCK_SIZE ++#define EVP_CIPHER_block_size_CBC AES_BLOCK_SIZE ++#define EVP_CIPHER_block_size_OFB 1 ++#define EVP_CIPHER_block_size_CFB 1 ++ ++/* Declaring so many ciphers by hand would be a pain. ++ Instead DECLARE_AES_EVP(ksize,lmode,umode) token-pastes its arguments: it defines aesni_KSIZE_LMODE from NID_aes_KSIZE_LMODE, EVP_CIPHER_block_size_UMODE, ksize / 8 (16, 24 or 32 for the declarations below), EVP_CIPH_UMODE_MODE, aesni_init_key and aesni_cipher_LMODE :-) */ ++#define DECLARE_AES_EVP(ksize,lmode,umode) \ ++static const EVP_CIPHER aesni_##ksize##_##lmode = { \ ++ NID_aes_##ksize##_##lmode, \ ++ EVP_CIPHER_block_size_##umode, \ ++ ksize / 8, \ ++ AES_BLOCK_SIZE, \ ++ 0 | EVP_CIPH_##umode##_MODE, \ ++ aesni_init_key, \ ++ aesni_cipher_##lmode, \ ++ NULL, \ ++ sizeof(AESNI_KEY), \ ++ EVP_CIPHER_set_asn1_iv, \ ++ EVP_CIPHER_get_asn1_iv, \ ++ NULL, \ ++ NULL \ ++} ++ ++DECLARE_AES_EVP(128,ecb,ECB); ++DECLARE_AES_EVP(128,cbc,CBC); ++DECLARE_AES_EVP(128,cfb,CFB); ++DECLARE_AES_EVP(128,ofb,OFB); ++ ++DECLARE_AES_EVP(192,ecb,ECB); ++DECLARE_AES_EVP(192,cbc,CBC); ++DECLARE_AES_EVP(192,cfb,CFB); ++DECLARE_AES_EVP(192,ofb,OFB); ++ ++DECLARE_AES_EVP(256,ecb,ECB); ++DECLARE_AES_EVP(256,cbc,CBC); ++DECLARE_AES_EVP(256,cfb,CFB); ++DECLARE_AES_EVP(256,ofb,OFB); ++/* ENGINE ciphers callback (the one handed to ENGINE_set_ciphers). With cipher == NULL it stores the NID table in *nids and returns its length. Otherwise it points *cipher at the table entry for nid and returns 1, or sets *cipher to NULL and returns 0 when nid is not supported. The e argument is not used. */ ++static int ++aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, ++ const int **nids, int nid) ++{ ++ /* No specific cipher => return a list of supported nids ... */ ++ if (!cipher) { ++ *nids = aesni_cipher_nids; ++ return aesni_cipher_nids_num; ++ } ++ ++ /* ... 
or the requested "cipher" otherwise */ ++ switch (nid) { ++ case NID_aes_128_ecb: ++ *cipher = &aesni_128_ecb; ++ break; ++ case NID_aes_128_cbc: ++ *cipher = &aesni_128_cbc; ++ break; ++ case NID_aes_128_cfb: ++ *cipher = &aesni_128_cfb; ++ break; ++ case NID_aes_128_ofb: ++ *cipher = &aesni_128_ofb; ++ break; ++ /* 192-bit key ciphers */ ++ case NID_aes_192_ecb: ++ *cipher = &aesni_192_ecb; ++ break; ++ case NID_aes_192_cbc: ++ *cipher = &aesni_192_cbc; ++ break; ++ case NID_aes_192_cfb: ++ *cipher = &aesni_192_cfb; ++ break; ++ case NID_aes_192_ofb: ++ *cipher = &aesni_192_ofb; ++ break; ++ /* 256-bit key ciphers */ ++ case NID_aes_256_ecb: ++ *cipher = &aesni_256_ecb; ++ break; ++ case NID_aes_256_cbc: ++ *cipher = &aesni_256_cbc; ++ break; ++ case NID_aes_256_cfb: ++ *cipher = &aesni_256_cfb; ++ break; ++ case NID_aes_256_ofb: ++ *cipher = &aesni_256_ofb; ++ break; ++ ++ default: ++ /* Sorry, we don't support this NID: hand back no cipher and return 0 */ ++ *cipher = NULL; ++ return 0; ++ } ++ ++ return 1; ++} ++ ++#endif /* COMPILE_HW_AESNI */ ++#endif /* !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AESNI) && !defined(OPENSSL_NO_AES) */ +diff -up openssl-1.0.0-beta4/crypto/engine/eng_all.c.aesni openssl-1.0.0-beta4/crypto/engine/eng_all.c +--- openssl-1.0.0-beta4/crypto/engine/eng_all.c.aesni 2010-01-07 23:38:31.000000000 +0100 ++++ openssl-1.0.0-beta4/crypto/engine/eng_all.c 2010-01-12 22:18:06.000000000 +0100 +@@ -85,6 +85,9 @@ void ENGINE_load_builtin_engines(void) + #if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV)) + ENGINE_load_cryptodev(); + #endif ++#if !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AESNI) ++ ENGINE_load_aesni(); ++#endif + ENGINE_load_dynamic(); + #ifndef OPENSSL_NO_STATIC_ENGINE + #ifndef OPENSSL_NO_HW +diff -up openssl-1.0.0-beta4/crypto/engine/engine.h.aesni openssl-1.0.0-beta4/crypto/engine/engine.h +--- openssl-1.0.0-beta4/crypto/engine/engine.h.aesni 2010-01-07 23:38:30.000000000 +0100 ++++ openssl-1.0.0-beta4/crypto/engine/engine.h 2010-01-12 
22:18:06.000000000 +0100 +@@ -342,6 +342,7 @@ void ENGINE_load_gost(void); + #endif + #endif + void ENGINE_load_cryptodev(void); ++void ENGINE_load_aesni(void); + void ENGINE_load_builtin_engines(void); + + /* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation +diff -up openssl-1.0.0-beta4/crypto/engine/Makefile.aesni openssl-1.0.0-beta4/crypto/engine/Makefile +--- openssl-1.0.0-beta4/crypto/engine/Makefile.aesni 2008-06-04 13:01:29.000000000 +0200 ++++ openssl-1.0.0-beta4/crypto/engine/Makefile 2010-01-12 22:18:06.000000000 +0100 +@@ -21,12 +21,14 @@ LIBSRC= eng_err.c eng_lib.c eng_list.c e + eng_table.c eng_pkey.c eng_fat.c eng_all.c \ + tb_rsa.c tb_dsa.c tb_ecdsa.c tb_dh.c tb_ecdh.c tb_rand.c tb_store.c \ + tb_cipher.c tb_digest.c tb_pkmeth.c tb_asnmth.c \ +- eng_openssl.c eng_cnf.c eng_dyn.c eng_cryptodev.c ++ eng_openssl.c eng_cnf.c eng_dyn.c eng_cryptodev.c \ ++ eng_aesni.c + LIBOBJ= eng_err.o eng_lib.o eng_list.o eng_init.o eng_ctrl.o \ + eng_table.o eng_pkey.o eng_fat.o eng_all.o \ + tb_rsa.o tb_dsa.o tb_ecdsa.o tb_dh.o tb_ecdh.o tb_rand.o tb_store.o \ + tb_cipher.o tb_digest.o tb_pkmeth.o tb_asnmth.o \ +- eng_openssl.o eng_cnf.o eng_dyn.o eng_cryptodev.o ++ eng_openssl.o eng_cnf.o eng_dyn.o eng_cryptodev.o \ ++ eng_aesni.o + + SRC= $(LIBSRC) + +diff -up openssl-1.0.0-beta4/crypto/evp/evp_err.c.aesni openssl-1.0.0-beta4/crypto/evp/evp_err.c +--- openssl-1.0.0-beta4/crypto/evp/evp_err.c.aesni 2010-01-07 23:38:31.000000000 +0100 ++++ openssl-1.0.0-beta4/crypto/evp/evp_err.c 2010-01-12 22:18:06.000000000 +0100 +@@ -1,6 +1,6 @@ + /* crypto/evp/evp_err.c */ + /* ==================================================================== +- * Copyright (c) 1999-2008 The OpenSSL Project. All rights reserved. ++ * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions +@@ -70,6 +70,7 @@ + + static ERR_STRING_DATA EVP_str_functs[]= + { ++{ERR_FUNC(EVP_F_AESNI_INIT_KEY), "AESNI_INIT_KEY"}, + {ERR_FUNC(EVP_F_AES_INIT_KEY), "AES_INIT_KEY"}, + {ERR_FUNC(EVP_F_CAMELLIA_INIT_KEY), "CAMELLIA_INIT_KEY"}, + {ERR_FUNC(EVP_F_D2I_PKEY), "D2I_PKEY"}, +@@ -85,7 +86,7 @@ static ERR_STRING_DATA EVP_str_functs[]= + {ERR_FUNC(EVP_F_EVP_DIGESTINIT_EX), "EVP_DigestInit_ex"}, + {ERR_FUNC(EVP_F_EVP_ENCRYPTFINAL_EX), "EVP_EncryptFinal_ex"}, + {ERR_FUNC(EVP_F_EVP_MD_CTX_COPY_EX), "EVP_MD_CTX_copy_ex"}, +-{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_SIZE"}, ++{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_size"}, + {ERR_FUNC(EVP_F_EVP_OPENINIT), "EVP_OpenInit"}, + {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD), "EVP_PBE_alg_add"}, + {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD_TYPE), "EVP_PBE_alg_add_type"}, +diff -up openssl-1.0.0-beta4/crypto/evp/evp.h.aesni openssl-1.0.0-beta4/crypto/evp/evp.h +--- openssl-1.0.0-beta4/crypto/evp/evp.h.aesni 2010-01-07 23:38:31.000000000 +0100 ++++ openssl-1.0.0-beta4/crypto/evp/evp.h 2010-01-12 22:18:06.000000000 +0100 +@@ -1162,6 +1162,7 @@ void ERR_load_EVP_strings(void); + /* Error codes for the EVP functions. */ + + /* Function codes. 
*/ ++#define EVP_F_AESNI_INIT_KEY 163 + #define EVP_F_AES_INIT_KEY 133 + #define EVP_F_CAMELLIA_INIT_KEY 159 + #define EVP_F_D2I_PKEY 100 +diff -up openssl-1.0.0-beta4/test/test_aesni.aesni openssl-1.0.0-beta4/test/test_aesni +--- openssl-1.0.0-beta4/test/test_aesni.aesni 2010-01-12 22:18:06.000000000 +0100 ++++ openssl-1.0.0-beta4/test/test_aesni 2010-01-12 22:18:06.000000000 +0100 +@@ -0,0 +1,69 @@ ++#!/bin/sh ++ ++PROG=$1 ++ ++if [ -x $PROG ]; then ++ if expr "x`$PROG version`" : "xOpenSSL" > /dev/null; then ++ : ++ else ++ echo "$PROG is not OpenSSL executable" ++ exit 1 ++ fi ++else ++ echo "$PROG is not executable" ++ exit 1; ++fi ++ ++if $PROG engine aesni | grep -v no-aesni; then ++ ++ HASH=`cat $PROG | $PROG dgst -hex` ++ ++ AES_ALGS=" aes-128-ecb aes-192-ecb aes-256-ecb \ ++ aes-128-cbc aes-192-cbc aes-256-cbc \ ++ aes-128-cfb aes-192-cfb aes-256-cfb \ ++ aes-128-ofb aes-192-ofb aes-256-ofb" ++ BUFSIZE="16 32 48 64 80 96 128 144 999" ++ ++ nerr=0 ++ ++ for alg in $AES_ALGS; do ++ echo $alg ++ for bufsize in $BUFSIZE; do ++ TEST=`( cat $PROG | \ ++ $PROG enc -e -k "$HASH" -$alg -bufsize $bufsize -engine aesni | \ ++ $PROG enc -d -k "$HASH" -$alg | \ ++ $PROG dgst -hex ) 2>/dev/null` ++ if [ "$TEST" != "$HASH" ]; then ++ echo "-$alg/$bufsize encrypt test failed" ++ nerr=`expr $nerr + 1` ++ fi ++ done ++ for bufsize in $BUFSIZE; do ++ TEST=`( cat $PROG | \ ++ $PROG enc -e -k "$HASH" -$alg | \ ++ $PROG enc -d -k "$HASH" -$alg -bufsize $bufsize -engine aesni | \ ++ $PROG dgst -hex ) 2>/dev/null` ++ if [ "$TEST" != "$HASH" ]; then ++ echo "-$alg/$bufsize decrypt test failed" ++ nerr=`expr $nerr + 1` ++ fi ++ done ++ TEST=`( cat $PROG | \ ++ $PROG enc -e -k "$HASH" -$alg -engine aesni | \ ++ $PROG enc -d -k "$HASH" -$alg -engine aesni | \ ++ $PROG dgst -hex ) 2>/dev/null` ++ if [ "$TEST" != "$HASH" ]; then ++ echo "-$alg en/decrypt test failed" ++ nerr=`expr $nerr + 1` ++ fi ++ done ++ ++ if [ $nerr -gt 0 ]; then ++ echo "AESNI engine test failed." 
++ exit 1; ++ fi ++else ++ echo "AESNI engine is not available" ++fi ++ ++exit 0 diff --git a/openssl.spec b/openssl.spec index 49af8f2..2724b5e 100644 --- a/openssl.spec +++ b/openssl.spec @@ -23,7 +23,7 @@ Summary: A general purpose cryptography library with TLS implementation Name: openssl Version: 1.0.0 -Release: 0.17.%{beta}%{?dist} +Release: 0.18.%{beta}%{?dist} # We remove certain patented algorithms from the openssl source tarball # with the hobble-openssl script which is included below. Source: openssl-%{version}-%{beta}-usa.tar.bz2 @@ -72,6 +72,7 @@ Patch65: openssl-1.0.0-beta4-dtls-reneg.patch Patch66: openssl-1.0.0-beta4-backports2.patch Patch67: openssl-1.0.0-beta4-reneg-scsv.patch Patch68: openssl-1.0.0-beta4-tls-comp.patch +Patch69: openssl-1.0.0-beta4-aesni.patch License: OpenSSL Group: System Environment/Libraries @@ -160,6 +161,7 @@ from other formats to the formats used by the OpenSSL toolkit. %patch66 -p1 -b .backports2 %patch67 -p1 -b .scsv %patch68 -p1 -b .tls-comp +%patch69 -p1 -b .aesni # Modify the various perl scripts to reference perl in the right location. perl util/perlpath.pl `dirname %{__perl}` @@ -408,10 +410,13 @@ rm -rf $RPM_BUILD_ROOT/%{_libdir}/fipscanister.* %postun -p /sbin/ldconfig %changelog +* Wed Jan 13 2010 Tomas Mraz 1.0.0-0.18.beta4 +- add support for Intel AES-NI + * Thu Jan 7 2010 Tomas Mraz 1.0.0-0.17.beta4 - upstream fix compression handling on session resumption - various null checks and other small fixes from upstream -- upstream changes for the renegotiation info according to the latest draft +- upstream changes for the renegotiation info according to the latest draft * Mon Nov 23 2009 Tomas Mraz 1.0.0-0.16.beta4 - fix non-fips mingw build (patch by Kalev Lember)