M7350v1_en_gpl

This commit is contained in:
T
2024-09-09 08:52:07 +00:00
commit f9cc65cfda
65988 changed files with 26357421 additions and 0 deletions
@@ -0,0 +1,270 @@
#!/usr/bin/env perl
# ====================================================================
# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# At some point it became apparent that the original SSLeay RC4
# assembler implementation performs suboptimally on latest IA-32
# microarchitectures. After re-tuning, performance has changed as
# follows:
#
# Pentium -10%
# Pentium III +12%
# AMD +50%(*)
# P4 +250%(**)
#
# (*) This number is actually a trade-off:-) It's possible to
# achieve +72%, but at the cost of -48% off PIII performance.
# In other words code performing further 13% faster on AMD
# would perform almost 2 times slower on Intel PIII...
# For reference! This code delivers ~80% of rc4-amd64.pl
# performance on the same Opteron machine.
# (**) This number requires compressed key schedule set up by
# RC4_set_key [see commentary below for further details].
#
# <appro@fy.chalmers.se>
# Extract the directory part of the script path ($0) so that the perlasm
# support library can be found either next to this script or in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
# The first command-line argument is passed straight to asm_init
# (presumably it selects the assembler flavour -- see x86asm.pl).
&asm_init($ARGV[0],"rc4-586.pl");
# Register assignment used by RC4_loop and the RC4 function below:
#   $xx  - x index (key->x), only the low byte is advanced
#   $yy  - y index (key->y), only the low byte is advanced
#   $tx  - holds the key-schedule entry at index x
#   $ty  - holds the key-schedule entry at index y, then the output index
#   $inp - input pointer
#   $out - output pointer, re-biased to (out - inp); doubles as a 32-bit
#          accumulator inside the 4-byte loop
#   $dat - pointer to the key schedule
$xx="eax";
$yy="ebx";
$tx="ecx";
$ty="edx";
$inp="esi";
$out="ebp";
$dat="edi";
# Emit the instructions that produce one keystream byte inside the
# 4-bytes-per-iteration loop of RC4.
#
# Argument: $i - position (0..3) of the byte within the current 4-byte chunk.
#
# On entry to the emitted code $tx must hold the key-schedule entry at
# index x. The keystream bytes are collected in $out: the first byte is
# moved in, the following ones are or-ed in after rotating $out right by 8.
sub RC4_loop {
my $i=shift;
# mov for the first byte of the chunk, or for the remaining three
my $func = ($i==0)?*mov:*or;
# y += S[x], byte-wide so it wraps at 256
&add (&LB($yy),&LB($tx));
# swap S[x] and S[y] ($tx still holds the old S[x], $ty the old S[y])
&mov ($ty,&DWP(0,$dat,$yy,4));
&mov (&DWP(0,$dat,$yy,4),$tx);
&mov (&DWP(0,$dat,$xx,4),$ty);
# $ty = (S[x] + S[y]) & 0xff, the index of the keystream byte
&add ($ty,$tx);
# x++ (byte-wide), ready for the next byte
&inc (&LB($xx));
&and ($ty,0xff);
# make room in the accumulator for the next keystream byte
&ror ($out,8) if ($i!=0);
if ($i<3) {
# preload S[x] for the next byte of this chunk
&mov ($tx,&DWP(0,$dat,$xx,4));
} else {
# last byte of the chunk: $out is busy as accumulator, so fetch the
# re-biased output pointer that the caller saved in wparam(3)
&mov ($tx,&wparam(3)); # reload [re-biased] out
}
# merge the keystream byte S[$ty] into the accumulator
&$func ($out,&DWP(0,$dat,$ty,4));
}
# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
#
# Emits the RC4 routine. Three code paths are generated:
#   loop4    - 32-bit key schedule, four bytes per iteration (via RC4_loop)
#   loop1    - 32-bit key schedule, one byte per iteration (short input and
#              the tail left over by loop4)
#   RC4_CHAR - byte-wide ("compressed") key schedule, one byte per iteration
# The compressed schedule is recognised by the -1 marker tested at offset
# 256 of the table; RC4_set_key in this file writes that marker.
&function_begin("RC4");
&mov ($dat,&wparam(0)); # load key schedule pointer
&mov ($ty, &wparam(1)); # load len
&mov ($inp,&wparam(2)); # load inp
&mov ($out,&wparam(3)); # load out
&xor ($xx,$xx); # avoid partial register stalls
&xor ($yy,$yy);
&cmp ($ty,0); # safety net
&je (&label("abort"));
&mov (&LB($xx),&BP(0,$dat)); # load key->x
&mov (&LB($yy),&BP(4,$dat)); # load key->y
# step over the x and y fields so that $dat points at the table itself
&add ($dat,8);
# $tx = inp + len, the end-of-input pointer
&lea ($tx,&DWP(0,$inp,$ty));
&sub ($out,$inp); # re-bias out
&mov (&wparam(1),$tx); # save input+len
# x is kept pre-incremented while processing; undone at "done"
&inc (&LB($xx));
# detect compressed key schedule...
&cmp (&DWP(256,$dat),-1);
&je (&label("RC4_CHAR"));
# preload S[x] for the first byte
&mov ($tx,&DWP(0,$dat,$xx,4));
&and ($ty,-4); # how many 4-byte chunks?
&jz (&label("loop1"));
&lea ($ty,&DWP(-4,$inp,$ty));
&mov (&wparam(2),$ty); # save input+(len/4)*4-4
&mov (&wparam(3),$out); # $out as accumulator in this loop
&set_label("loop4",16);
for ($i=0;$i<4;$i++) { RC4_loop($i); }
# final rotate puts the first keystream byte into the low byte
&ror ($out,8);
# xor 4 keystream bytes with 4 input bytes
&xor ($out,&DWP(0,$inp));
&cmp ($inp,&wparam(2)); # compare to input+(len/4)*4-4
&mov (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here
&lea ($inp,&DWP(4,$inp));
# lea and mov leave the flags from the cmp above intact for the jb below
&mov ($tx,&DWP(0,$dat,$xx,4));
&jb (&label("loop4"));
&cmp ($inp,&wparam(1)); # compare to input+len
&je (&label("done"));
&mov ($out,&wparam(3)); # restore $out
&set_label("loop1",16);
# y += S[x]
&add (&LB($yy),&LB($tx));
# swap S[x] and S[y]
&mov ($ty,&DWP(0,$dat,$yy,4));
&mov (&DWP(0,$dat,$yy,4),$tx);
&mov (&DWP(0,$dat,$xx,4),$ty);
# $ty = S[(S[x]+S[y]) & 0xff], the keystream byte
&add ($ty,$tx);
&inc (&LB($xx));
&and ($ty,0xff);
&mov ($ty,&DWP(0,$dat,$ty,4));
&xor (&LB($ty),&BP(0,$inp));
&lea ($inp,&DWP(1,$inp));
&mov ($tx,&DWP(0,$dat,$xx,4));
&cmp ($inp,&wparam(1)); # compare to input+len
# $inp was already advanced, hence the -1 displacement
&mov (&BP(-1,$out,$inp),&LB($ty));
&jb (&label("loop1"));
&jmp (&label("done"));
# this is essentially Intel P4 specific codepath...
&set_label("RC4_CHAR",16);
&movz ($tx,&BP(0,$dat,$xx));
# strangely enough unrolled loop performs over 20% slower...
&set_label("cloop1");
# same algorithm as loop1, but with byte-sized table entries
&add (&LB($yy),&LB($tx));
&movz ($ty,&BP(0,$dat,$yy));
&mov (&BP(0,$dat,$yy),&LB($tx));
&mov (&BP(0,$dat,$xx),&LB($ty));
&add (&LB($ty),&LB($tx));
&movz ($ty,&BP(0,$dat,$ty));
&add (&LB($xx),1);
&xor (&LB($ty),&BP(0,$inp));
&lea ($inp,&DWP(1,$inp));
&movz ($tx,&BP(0,$dat,$xx));
&cmp ($inp,&wparam(1));
&mov (&BP(-1,$out,$inp),&LB($ty));
&jb (&label("cloop1"));
&set_label("done");
# undo the pre-increment of x before storing it back
&dec (&LB($xx));
# $dat was advanced by 8 above, so x and y live at -8 and -4
&mov (&BP(-4,$dat),&LB($yy)); # save key->y
&mov (&BP(-8,$dat),&LB($xx)); # save key->x
&set_label("abort");
&function_end("RC4");
########################################################################
# Register assignment for RC4_set_key (re-uses and overrides the names
# that were set up for RC4 earlier in the file):
#   $inp - points at the END of the user key
#   $out - points at key->data
#   $idi - negative offset into the user key, counts up towards zero
#   $ido - index i into the table
#   $idx - index j into the table (first used for the capability pointer)
$inp="esi";
$out="edi";
$idi="ebp";
$ido="ecx";
$idx="edx";
&external_label("OPENSSL_ia32cap_P");
# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
#
# Emits the key-setup routine. Depending on bit 20 of the first word of
# OPENSSL_ia32cap_P it builds either the 32-bit-per-entry table
# (w1stloop/w2ndloop) or the byte-per-entry "compressed" table
# (c1stloop/c2ndloop) and marks the latter with -1 at offset 256.
&function_begin("RC4_set_key");
&mov ($out,&wparam(0)); # load key
&mov ($idi,&wparam(1)); # load len
&mov ($inp,&wparam(2)); # load data
# NOTE(review): picmeup comes from x86asm.pl; it is used here as if it
# leaves the address of OPENSSL_ia32cap_P in $idx -- confirm there.
&picmeup($idx,"OPENSSL_ia32cap_P");
&lea ($out,&DWP(2*4,$out)); # &key->data
&lea ($inp,&DWP(0,$inp,$idi)); # $inp to point at the end
# $idi = -len, so that $inp+$idi walks the user key from its start
&neg ($idi);
&xor ("eax","eax");
# keep -len around so the key index can be restarted when it wraps
&mov (&DWP(-4,$out),$idi); # borrow key->y
&bt (&DWP(0,$idx),20); # check for bit#20
&jc (&label("c1stloop"));
&set_label("w1stloop",16);
&mov (&DWP(0,$out,"eax",4),"eax"); # key->data[i]=i;
&add (&LB("eax"),1); # i++;
# the byte-wide add carries after i==255, ending the loop
&jnc (&label("w1stloop"));
&xor ($ido,$ido);
&xor ($idx,$idx);
&set_label("w2ndloop",16);
# eax = data[i]
&mov ("eax",&DWP(0,$out,$ido,4));
# j += next user-key byte + data[i] (byte-wide)
&add (&LB($idx),&BP(0,$inp,$idi));
&add (&LB($idx),&LB("eax"));
# advance the key offset; ZF is set when the end of the key is reached
&add ($idi,1);
# ebx = data[j]; mov does not alter the flags tested by jnz below
&mov ("ebx",&DWP(0,$out,$idx,4));
&jnz (&label("wnowrap"));
# end of user key: reload -len to start over from its first byte
&mov ($idi,&DWP(-4,$out));
&set_label("wnowrap");
# swap data[i] and data[j]
&mov (&DWP(0,$out,$idx,4),"eax");
&mov (&DWP(0,$out,$ido,4),"ebx");
&add (&LB($ido),1);
&jnc (&label("w2ndloop"));
&jmp (&label("exit"));
# Unlike all other x86 [and x86_64] implementations, Intel P4 core
# [including EM64T] was found to perform poorly with above "32-bit" key
# schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded
# assembler turned out to be 3.5x if re-coded for compressed 8-bit one,
# a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit
# schedule for x86[_64], because non-P4 implementations suffer from
# significant performance losses then, e.g. PIII exhibits >2x
# deterioration, and so does Opteron. In order to assure optimal
# all-round performance, we detect P4 at run-time and set up compressed
# key schedule, which is recognized by RC4 procedure.
&set_label("c1stloop",16);
&mov (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i;
&add (&LB("eax"),1); # i++;
&jnc (&label("c1stloop"));
&xor ($ido,$ido);
&xor ($idx,$idx);
# upper bytes of ebx must be clear because only its low byte is loaded
&xor ("ebx","ebx");
&set_label("c2ndloop",16);
# same shuffle as w2ndloop, on byte-sized entries
&mov (&LB("eax"),&BP(0,$out,$ido));
&add (&LB($idx),&BP(0,$inp,$idi));
&add (&LB($idx),&LB("eax"));
&add ($idi,1);
&mov (&LB("ebx"),&BP(0,$out,$idx));
&jnz (&label("cnowrap"));
&mov ($idi,&DWP(-4,$out));
&set_label("cnowrap");
&mov (&BP(0,$out,$idx),&LB("eax"));
&mov (&BP(0,$out,$ido),&LB("ebx"));
&add (&LB($ido),1);
&jnc (&label("c2ndloop"));
&mov (&DWP(256,$out),-1); # mark schedule as compressed
&set_label("exit");
# this also overwrites the -len that was parked in key->y
&xor ("eax","eax");
&mov (&DWP(-8,$out),"eax"); # key->x=0;
&mov (&DWP(-4,$out),"eax"); # key->y=0;
&function_end("RC4_set_key");
# const char *RC4_options(void);
#
# Emits a routine that returns (in eax) the address of one of the two
# option strings placed at label "opts": the first one by default, the
# second one when bit 20 of OPENSSL_ia32cap_P is set, i.e. under the same
# condition that makes RC4_set_key build the compressed schedule.
&function_begin_B("RC4_options");
# call/pop pair obtains the current address for position-independent
# addressing of "opts"
&call (&label("pic_point"));
&set_label("pic_point");
&blindpop("eax");
&lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
&picmeup("edx","OPENSSL_ia32cap_P");
&bt (&DWP(0,"edx"),20);
&jnc (&label("skip"));
# 12 = length of "rc4(4x,int)" plus one byte; assumes asciz appends a
# single terminating NUL -- TODO confirm in x86asm.pl
&add ("eax",12);
&set_label("skip");
&ret ();
&set_label("opts",64);
&asciz ("rc4(4x,int)");
&asciz ("rc4(1x,char)");
&asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
&align (64);
&function_end_B("RC4_options");
# flush the generated assembler
&asm_finish();
@@ -0,0 +1,755 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by David Mosberger <David.Mosberger@acm.org> based on the
# Itanium optimized Crypto code which was released by HP Labs at
# http://www.hpl.hp.com/research/linux/crypto/.
#
# Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
# This is a little helper program which generates a software-pipelined
# loop for RC4 encryption. The basic algorithm looks like this:
#
# for (counter = 0; counter < len; ++counter)
# {
# in = inp[counter];
# SI = S[I];
# J = (SI + J) & 0xff;
# SJ = S[J];
# T = (SI + SJ) & 0xff;
# S[I] = SJ, S[J] = SI;
# ST = S[T];
# outp[counter] = in ^ ST;
# I = (I + 1) & 0xff;
# }
#
# Pipelining this loop isn't easy, because the stores to the S[] array
# need to be observed in the right order. The loop generated by the
# code below has the following pipeline diagram:
#
# cycle
# | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
# iter
# 1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
# 2: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
# 3: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
#
# where:
# LDI = load of S[I]
# LDJ = load of S[J]
# SWP = swap of S[I] and S[J]
# LDT = load of S[T]
#
# Note that in the above diagram, the major trouble-spot is that LDI
# of the 2nd iteration is performed BEFORE the SWP of the first
# iteration. Fortunately, this is easy to detect (I of the 1st
# iteration will be equal to J of the 2nd iteration) and when this
# happens, we simply forward the proper value from the 1st iteration
# to the 2nd one. The proper value in this case is simply the value
# of S[I] from the first iteration (thanks to the fact that SWP
# simply swaps the contents of S[I] and S[J]).
#
# Another potential trouble-spot is in cycle 7, where SWP of the 1st
# iteration issues at the same time as the LDI of the 3rd iteration.
# However, thanks to IA-64 execution semantics, this can be taken
# care of simply by placing LDI later in the instruction-group than
# SWP. IA-64 CPUs will automatically forward the value if they
# detect that the SWP and LDI are accessing the same memory-location.
# The core-loop that can be pipelined then looks like this (annotated
# with McKinley/Madison issue port & latency numbers, assuming L1
# cache hits for the most part):
# operation: instruction: issue-ports: latency
# ------------------ ----------------------------- ------------- -------
# Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0
# shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc
# I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc
# ;;
# SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP!
# ;;
# cmp.eq.unc pBypass = I, J * after J is valid!
# J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2
# (pBypass) br.cond.spnt Bypass
# ;;
# ---------------------------------------------------------------------------------------
# J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3
# ;;
# shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4
# ;;
# SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5
# ;;
# ---------------------------------------------------------------------------------------
# T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6
# ;;
# T = T & 0xff zxt1 T = T I0, I1 1 cyc
# S[I] = SJ st8 [Iptr] = SJ M2-M3 c7
# S[J] = SI st8 [Jptr] = SI M2-M3
# ;;
# shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8
# ;;
# ---------------------------------------------------------------------------------------
# T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9
# ;;
# data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c10
# ;;
# *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c11
# ;;
# ---------------------------------------------------------------------------------------
# There are several points worth making here:
# - Note that due to the bypass/forwarding-path, the first two
# phases of the loop are strangely mingled together. In
# particular, note that the first stage of the pipeline is
# using the value of "J", as calculated by the second stage.
# - Each bundle-pair will have exactly 6 instructions.
# - Pipelined, the loop can execute in 3 cycles/iteration and
# 4 stages. However, McKinley/Madison can issue "st1" to
# the same bank at a rate of at most one per 4 cycles. Thus,
# instead of storing each byte, we accumulate them in a word
# and then write them back at once with a single "st8" (this
# implies that the setup code needs to ensure that the output
# buffer is properly aligned, if need be, by encoding the
# first few bytes separately).
# - There is no space for a "br.ctop" instruction. For this
# reason we can't use modulo-loop support in IA-64 and have
# to do a traditional, purely software-pipelined loop.
# - We can't replace any of the remaining "add/zxt1" pairs with
# "padd1" because the latency for that instruction is too high
# and would push the loop to the point where more bypasses
# would be needed, which we don't have space for.
# - The above loop runs at around 3.26 cycles/byte, or roughly
# 440 MByte/sec on a 1.5GHz Madison. This is well below the
# system bus bandwidth and hence with judicious use of
# "lfetch" this loop can run at (almost) peak speed even when
# the input and output data reside in memory. The
# max. latency that can be tolerated is (PREFETCH_DISTANCE *
# L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
# least) 1-ahead prefetching of 128 byte cache-lines. Note
# that we do NOT prefetch into L1, since that would only
# interfere with the S[] table values stored there. This is
# acceptable because there is a 10 cycle latency between
# load and first use of the input data.
# - We use a branch to out-of-line bypass-code because of cycle-pressure:
# we calculate the next J, check for the need to activate the
# bypass path, and activate the bypass path ALL IN THE SAME
# CYCLE. If we didn't have these constraints, we could do
# the bypass with a simple conditional move instruction.
# Fortunately, the bypass paths get activated relatively
# infrequently, so the extra branches don't cost all that much
# (about 0.04 cycles/byte, measured on a 16396 byte file with
# random input data).
#
# NOTE(review): the assembler template later in this file divides by 24
# using a hard-coded multiply/shift, and emit_body has a special case for
# $unroll_count == 6, so $phases and $unroll_count do not look freely
# tunable -- confirm before changing them.
$phases = 4; # number of stages/phases in the pipelined-loop
$unroll_count = 6; # number of times we unrolled it
# Bit masks for the predicate argument ($p) of emit_body; one bit per
# pipeline stage: compute I, compute J, compute T, produce output.
$pComI = (1 << 0);
$pComJ = (1 << 1);
$pComT = (1 << 2);
$pOut = (1 << 3);
# Number of entries in each register array; emit_body reduces its
# iteration numbers modulo these values. They match the array sizes in
# the ".rotr" directive of the assembler template
# (Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], OutWord[2]).
$NData = 4;
$NIP = 3;
$NJP = 2;
$NI = 2;
$NSI = 3;
$NSJ = 2;
$NT = 2;
$NOutWord = 2;
#
# $threshold is the minimum length before we attempt to use the
# big software-pipelined loop. It MUST be greater-or-equal
# to:
# PHASES * (UNROLL_COUNT + 1) + 7
#
# The "+ 7" comes from the fact we may have to encode up to
# 7 bytes separately before the output pointer is aligned.
#
# The value actually used below is three times the pipelined part of
# that minimum (plus the same 7 bytes), which satisfies the bound.
#
$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
# Append one formatted instruction line to a code buffer.
#
#   $buf_ref - reference to the scalar that accumulates the output
#   $format  - sprintf format of the instruction
#   @args    - values for the format
#
# The line is prefixed with two tabs and terminated with a newline.
# Returns the updated buffer contents.
sub I {
    my ($buf_ref, $format, @args) = @_;
    my $line = sprintf ("\t\t".$format."\n", @args);
    $$buf_ref .= $line;
}
# Append one formatted line to a code buffer without any indentation
# (used for labels and predicated instructions).
#
#   $buf_ref - reference to the scalar that accumulates the output
#   $format  - sprintf format of the line
#   @args    - values for the format
#
# A newline is appended. Returns the updated buffer contents.
sub P {
    my ($buf_ref, $format, @args) = @_;
    my $line = sprintf ($format."\n", @args);
    $$buf_ref .= $line;
}
# Append a ";;" line (the IA-64 stop marker used throughout the
# generated code) to the buffer referenced by the argument.
# Returns the updated buffer contents.
sub STOP {
    my ($buf_ref) = @_;
    $$buf_ref .= ";;\n";
}
# Emit one 3-cycle slot of the unrolled, software-pipelined loop.
#
# Arguments:
#   reference to the main code buffer (aliased to $c below)
#   reference to the out-of-line bypass code buffer (aliased to $bypass)
#   $iteration - running slot number, starting at 0
#   $p         - bit mask of active pipeline stages
#                ($pComI | $pComJ | $pComT | $pOut)
#
# Instructions of a stage are emitted only when its bit is set in $p;
# bundle braces are emitted only when all four stages are active. When no
# stage is active, only a 4-byte store of the pending output word is
# emitted.
sub emit_body {
local *c = shift;
local *bypass = shift;
local ($iteration, $p) = @_;
# Slot numbers of this iteration and the three before it; they are
# reduced modulo the register-array sizes to pick array entries.
local $i0 = $iteration;
local $i1 = $iteration - 1;
local $i2 = $iteration - 2;
local $i3 = $iteration - 3;
# Output-word numbers for the byte produced in this slot and in the
# previous one. "/" is not an integer division in Perl; the results are
# only used as operands of "%", which works on their integer parts.
local $iw0 = ($iteration - 3) / 8;
local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
# position of this slot's output byte within its 8-byte word
local $byte_num = ($iteration - 3) % 8;
# numeric suffix of this slot's .rc4Bypass/.rc4Resume labels
local $label = $iteration + 1;
# true when all four stages are active
local $pAny = ($p & 0xf) == 0xf;
# the bypass check is emitted whenever I is computed, except in slot 0
local $pByp = (($p & $pComI) && ($iteration > 0));
$c.=<<___;
//////////////////////////////////////////////////
___
# No stage active: just store OutWord with st4 (shifted right by 32
# first on big-endian hosts) and stop.
if (($p & 0xf) == 0) {
$c.="#ifdef HOST_IS_BIG_ENDIAN\n";
&I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",
$iw1 % $NOutWord, $iw1 % $NOutWord);
$c.="#endif\n";
&I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
return;
}
# Cycle 0
&I(\$c, "{ .mmi") if ($pAny);
&I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI);
&I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
&I(\$c, "zxt1 J = J") if ($p & $pComJ);
&I(\$c, "}") if ($pAny);
&I(\$c, "{ .mmi") if ($pAny);
&I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut);
&I(\$c, "add T[%u] = SI[%u], SJ[%u]",
$i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT);
&I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
&I(\$c, "}") if ($pAny);
&STOP(\$c);
# Cycle 1
&I(\$c, "{ .mmi") if ($pAny);
&I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
&I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
&I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
&I(\$c, "}") if ($pAny);
&I(\$c, "{ .mmi") if ($pAny);
&I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
&I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ);
&I(\$c, "xor Data[%u] = Data[%u], T[%u]",
$i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut);
&I(\$c, "}") if ($pAny);
&STOP(\$c);
# Cycle 2
&I(\$c, "{ .mmi") if ($pAny);
&I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
&I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp);
&I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
$iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
&I(\$c, "}") if ($pAny);
&I(\$c, "{ .mmb") if ($pAny);
&I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI);
&I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
&P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
&I(\$c, "}") if ($pAny);
&STOP(\$c);
# the out-of-line bypass code emitted below branches back to this label
&P(\$c, ".rc4Resume%u:", $label) if ($pByp);
# Once past the pipeline fill, a slot whose output byte starts a new
# 8-byte word also stores the previous, completed word.
if ($byte_num == 0 && $iteration >= $phases) {
&I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
$iw1 % $NOutWord) if ($p & $pOut);
# Last slot of the unrolled body: issue the prefetches and the
# loop-closing branch.
if ($iteration == (1 + $unroll_count) * $phases - 1) {
if ($unroll_count == 6) {
&I(\$c, "mov OutWord[%u] = OutWord[%u]",
$iw1 % $NOutWord, $iw0 % $NOutWord);
}
&I(\$c, "lfetch.nt1 [InPrefetch], %u",
$unroll_count * $phases);
&I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
$unroll_count * $phases);
&I(\$c, "br.cloop.sptk.few .rc4Loop");
}
}
# Out-of-line bypass code for this slot: take back the SI just added to
# J, add the previous slot's SI instead, copy that SI forward and resume.
if ($pByp) {
&P(\$bypass, ".rc4Bypass%u:", $label);
&I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
&I(\$bypass, "nop 0");
&I(\$bypass, "nop 0");
&I(\$bypass, ";;");
&I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
&I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
&I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
&I(\$bypass, ";;");
}
}
# Fixed first part of the generated IA-64 assembler source. It contains:
#   - preprocessor names for the registers and predicates that are used
#   - the SZ-dependent LKEY/SKEY/KEYADDR macros for table access
#   - the MODSCHED_RC4_PROLOGUE/MODSCHED_RC4_LOOP macros, a compact
#     one-byte-per-iteration loop
#   - the RC4 entry point: argument checks, prefetch of the state table,
#     the alignment loop and the computation of the unrolled loop count
# Perl interpolates $threshold, $unroll_count and $phases into the text.
# NOTE(review): the loop count is obtained by multiplying with
# 0xaaaaaaaaaaaaaaab (high part) and shifting right by 4, which looks like
# a division by 24 = $unroll_count*$phases for the current settings --
# confirm before changing either value.
$code=<<___;
.ident \"rc4-ia64.s, version 3.0\"
.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
#define LCSave r8
#define PRSave r9
/* Inputs become invalid once rotation begins! */
#define StateTable in0
#define DataLen in1
#define InputBuffer in2
#define OutputBuffer in3
#define KTable r14
#define J r15
#define InPtr r16
#define OutPtr r17
#define InPrefetch r18
#define OutPrefetch r19
#define One r20
#define LoopCount r21
#define Remainder r22
#define IFinal r23
#define EndPtr r24
#define tmp0 r25
#define tmp1 r26
#define pBypass p6
#define pDone p7
#define pSmall p8
#define pAligned p9
#define pUnaligned p10
#define pComputeI pPhase[0]
#define pComputeJ pPhase[1]
#define pComputeT pPhase[2]
#define pOutput pPhase[3]
#define RetVal r8
#define L_OK p7
#define L_NOK p8
#define _NINPUTS 4
#define _NOUTPUT 0
#define _NROTATE 24
#define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT)
#ifndef SZ
# define SZ 4 // this must be set to sizeof(RC4_INT)
#endif
#if SZ == 1
# define LKEY ld1
# define SKEY st1
# define KEYADDR(dst, i) add dst = i, KTable
#elif SZ == 2
# define LKEY ld2
# define SKEY st2
# define KEYADDR(dst, i) shladd dst = i, 1, KTable
#elif SZ == 4
# define LKEY ld4
# define SKEY st4
# define KEYADDR(dst, i) shladd dst = i, 2, KTable
#else
# define LKEY ld8
# define SKEY st8
# define KEYADDR(dst, i) shladd dst = i, 3, KTable
#endif
#if defined(_HPUX_SOURCE) && !defined(_LP64)
# define ADDP addp4
#else
# define ADDP add
#endif
/* Define a macro for the bit number of the n-th byte: */
#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
# define HOST_IS_BIG_ENDIAN
# define BYTE_POS(n) (56 - (8 * (n)))
#else
# define BYTE_POS(n) (8 * (n))
#endif
/*
We must perform the first phase of the pipeline explicitly since
we will always load from the stable the first time. The br.cexit
will never be taken since regardless of the number of bytes because
the epilogue count is 4.
*/
/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
assembler failed on original macro with syntax error. <appro> */
#define MODSCHED_RC4_PROLOGUE \\
{ \\
ld1 Data[0] = [InPtr], 1; \\
add IFinal = 1, I[1]; \\
KEYADDR(IPr[0], I[1]); \\
} ;; \\
{ \\
LKEY SI[0] = [IPr[0]]; \\
mov pr.rot = 0x10000; \\
mov ar.ec = 4; \\
} ;; \\
{ \\
add J = J, SI[0]; \\
zxt1 I[0] = IFinal; \\
br.cexit.spnt.few .+16; /* never taken */ \\
} ;;
#define MODSCHED_RC4_LOOP(label) \\
label: \\
{ .mmi; \\
(pComputeI) ld1 Data[0] = [InPtr], 1; \\
(pComputeI) add IFinal = 1, I[1]; \\
(pComputeJ) zxt1 J = J; \\
}{ .mmi; \\
(pOutput) LKEY T[1] = [T[1]]; \\
(pComputeT) add T[0] = SI[2], SJ[1]; \\
(pComputeI) KEYADDR(IPr[0], I[1]); \\
} ;; \\
{ .mmi; \\
(pComputeT) SKEY [IPr[2]] = SJ[1]; \\
(pComputeT) SKEY [JP[1]] = SI[2]; \\
(pComputeT) zxt1 T[0] = T[0]; \\
}{ .mmi; \\
(pComputeI) LKEY SI[0] = [IPr[0]]; \\
(pComputeJ) KEYADDR(JP[0], J); \\
(pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\
} ;; \\
{ .mmi; \\
(pComputeJ) LKEY SJ[0] = [JP[0]]; \\
(pOutput) xor Data[3] = Data[3], T[1]; \\
nop 0x0; \\
}{ .mmi; \\
(pComputeT) KEYADDR(T[0], T[0]); \\
(pBypass) mov SI[0] = SI[1]; \\
(pComputeI) zxt1 I[0] = IFinal; \\
} ;; \\
{ .mmb; \\
(pOutput) st1 [OutPtr] = Data[3], 1; \\
(pComputeI) add J = J, SI[0]; \\
br.ctop.sptk.few label; \\
} ;;
.text
.align 32
.type RC4, \@function
.global RC4
.proc RC4
.prologue
RC4:
{
.mmi
alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
.rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
OutWord[2]
.rotp pPhase[4]
ADDP InPrefetch = 0, InputBuffer
ADDP KTable = 0, StateTable
}
{
.mmi
ADDP InPtr = 0, InputBuffer
ADDP OutPtr = 0, OutputBuffer
mov RetVal = r0
}
;;
{
.mmi
lfetch.nt1 [InPrefetch], 0x80
ADDP OutPrefetch = 0, OutputBuffer
}
{ // Return 0 if the input length is nonsensical
.mib
ADDP StateTable = 0, StateTable
cmp.ge.unc L_NOK, L_OK = r0, DataLen
(L_NOK) br.ret.sptk.few rp
}
;;
{
.mib
cmp.eq.or L_NOK, L_OK = r0, InPtr
cmp.eq.or L_NOK, L_OK = r0, OutPtr
nop 0x0
}
{
.mib
cmp.eq.or L_NOK, L_OK = r0, StateTable
nop 0x0
(L_NOK) br.ret.sptk.few rp
}
;;
LKEY I[1] = [KTable], SZ
/* Prefetch the state-table. It contains 256 elements of size SZ */
#if SZ == 1
ADDP tmp0 = 1*128, StateTable
#elif SZ == 2
ADDP tmp0 = 3*128, StateTable
ADDP tmp1 = 2*128, StateTable
#elif SZ == 4
ADDP tmp0 = 7*128, StateTable
ADDP tmp1 = 6*128, StateTable
#elif SZ == 8
ADDP tmp0 = 15*128, StateTable
ADDP tmp1 = 14*128, StateTable
#endif
;;
#if SZ >= 8
lfetch.fault.nt1 [tmp0], -256 // 15
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 13
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 11
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 9
lfetch.fault.nt1 [tmp1], -256;;
#endif
#if SZ >= 4
lfetch.fault.nt1 [tmp0], -256 // 7
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 5
lfetch.fault.nt1 [tmp1], -256;;
#endif
#if SZ >= 2
lfetch.fault.nt1 [tmp0], -256 // 3
lfetch.fault.nt1 [tmp1], -256;;
#endif
{
.mii
lfetch.fault.nt1 [tmp0] // 1
add I[1]=1,I[1];;
zxt1 I[1]=I[1]
}
{
.mmi
lfetch.nt1 [InPrefetch], 0x80
lfetch.excl.nt1 [OutPrefetch], 0x80
.save pr, PRSave
mov PRSave = pr
} ;;
{
.mmi
lfetch.excl.nt1 [OutPrefetch], 0x80
LKEY J = [KTable], SZ
ADDP EndPtr = DataLen, InPtr
} ;;
{
.mmi
ADDP EndPtr = -1, EndPtr // Make it point to
// last data byte.
mov One = 1
.save ar.lc, LCSave
mov LCSave = ar.lc
.body
} ;;
{
.mmb
sub Remainder = 0, OutPtr
cmp.gtu pSmall, p0 = $threshold, DataLen
(pSmall) br.cond.dpnt .rc4Remainder // Data too small for
// big loop.
} ;;
{
.mmi
and Remainder = 0x7, Remainder
;;
cmp.eq pAligned, pUnaligned = Remainder, r0
nop 0x0
} ;;
{
.mmb
.pred.rel "mutex",pUnaligned,pAligned
(pUnaligned) add Remainder = -1, Remainder
(pAligned) sub Remainder = EndPtr, InPtr
(pAligned) br.cond.dptk.many .rc4Aligned
} ;;
{
.mmi
nop 0x0
nop 0x0
mov.i ar.lc = Remainder
}
/* Do the initial few bytes via the compact, modulo-scheduled loop
until the output pointer is 8-byte-aligned. */
MODSCHED_RC4_PROLOGUE
MODSCHED_RC4_LOOP(.RC4AlignLoop)
{
.mib
sub Remainder = EndPtr, InPtr
zxt1 IFinal = IFinal
clrrrb // Clear CFM.rrb.pr so
;; // next "mov pr.rot = N"
// does the right thing.
}
{
.mmi
mov I[1] = IFinal
nop 0x0
nop 0x0
} ;;
.rc4Aligned:
/*
Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
*/
{
.mlx
add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
movl Remainder = 0xaaaaaaaaaaaaaaab
} ;;
{
.mmi
setf.sig f6 = LoopCount // M2, M3 6 cyc
setf.sig f7 = Remainder // M2, M3 6 cyc
nop 0x0
} ;;
{
.mfb
nop 0x0
xmpy.hu f6 = f6, f7
nop 0x0
} ;;
{
.mmi
getf.sig LoopCount = f6;; // M2 5 cyc
nop 0x0
shr.u LoopCount = LoopCount, 4
} ;;
{
.mmi
nop 0x0
nop 0x0
mov.i ar.lc = LoopCount
} ;;
/* Now comes the unrolled loop: */
.rc4Prologue:
___
# Generate the unrolled loop by calling emit_body once per slot.
# $iteration numbers the slots consecutively across prologue, body and
# epilogue; $predicates is the stage mask handed to emit_body (only its
# low four bits are examined there).
$iteration = 0;
# Generate the prologue:
# The mask grows 0x1, 0x3, 0x7, 0xf, switching on one more stage per slot.
$predicates = 1;
for ($i = 0; $i < $phases; ++$i) {
&emit_body (\$code, \$bypass, $iteration++, $predicates);
$predicates = ($predicates << 1) | 1;
}
$code.=<<___;
.rc4Loop:
___
# Generate the body:
# All four low mask bits are set here, so every stage is active.
for ($i = 0; $i < $unroll_count*$phases; ++$i) {
&emit_body (\$code, \$bypass, $iteration++, $predicates);
}
$code.=<<___;
.rc4Epilogue:
___
# Generate the epilogue:
# Shifting the mask left clears one low bit per slot, draining the stages
# in order; the final slot has no active stage and only stores output.
for ($i = 0; $i < $phases; ++$i) {
$predicates <<= 1;
&emit_body (\$code, \$bypass, $iteration++, $predicates);
}
# Fixed tail of the routine: process whatever is left with the compact
# MODSCHED_RC4 loop, then write J and the final I back in front of the
# table, restore ar.lc and the predicates, and return 1.
$code.=<<___;
{
.mmi
lfetch.nt1 [EndPtr] // fetch line with last byte
mov IFinal = I[1]
nop 0x0
}
.rc4Remainder:
{
.mmi
sub Remainder = EndPtr, InPtr // Calculate
// # of bytes
// left - 1
nop 0x0
nop 0x0
} ;;
{
.mib
cmp.eq pDone, p0 = -1, Remainder // done already?
mov.i ar.lc = Remainder
(pDone) br.cond.dptk.few .rc4Complete
}
/* Do the remaining bytes via the compact, modulo-scheduled loop */
MODSCHED_RC4_PROLOGUE
MODSCHED_RC4_LOOP(.RC4RestLoop)
.rc4Complete:
{
.mmi
add KTable = -SZ, KTable
add IFinal = -1, IFinal
mov ar.lc = LCSave
} ;;
{
.mii
SKEY [KTable] = J,-SZ
zxt1 IFinal = IFinal
mov pr = PRSave, 0x1FFFF
} ;;
{
.mib
SKEY [KTable] = IFinal
add RetVal = 1, r0
br.ret.sptk.few rp
} ;;
___
# Last but not least, emit the code for the bypass-code of the unrolled loop:
# ($bypass was filled by emit_body; it is placed after the return so the
# main path branches out to it and back.)
$code.=$bypass;
$code.=<<___;
.endp RC4
___
# write the complete assembler source to standard output
print $code;
@@ -0,0 +1,205 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2009
#
# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
# "cluster" Address Generation Interlocks, so that one pipeline stall
# resolves several dependencies.
# $rp is the register the generated routines branch to in order to
# return; $sp is the base register of the area where they save and
# restore the registers they use.
($rp, $sp) = ("%r14", "%r15");
# The generated assembler source starts with the .text directive.
$code = ".text\n";
# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
#
# Generates the s390x RC4 routine: an 8-bytes-per-iteration loop (.Loop8,
# unrolled below) followed by a one-byte loop (.Loop1) for the remaining
# len & 7 bytes. The key structure is accessed as byte x at offset 0,
# byte y at offset 1 and a byte-wide table starting at offset 2.
{
# $acc collects eight keystream bytes; $cnt counts 8-byte chunks.
$acc="%r0";
$cnt="%r1";
# The four arguments, in prototype order.
$key="%r2";
$len="%r3";
$inp="%r4";
$out="%r5";
# Two registers each for the x index and for the table entry at x; the
# pairs are swapped after every unrolled step so that the value preloaded
# for x+1 becomes the current one.
@XX=("%r6","%r7");
@TX=("%r8","%r9");
$YY="%r10";
$TY="%r11";
$code.=<<___;
.globl RC4
.type RC4,\@function
.align 64
RC4:
stmg %r6,%r11,48($sp)
llgc $XX[0],0($key)
llgc $YY,1($key)
la $XX[0],1($XX[0])
nill $XX[0],0xff
srlg $cnt,$len,3
ltgr $cnt,$cnt
llgc $TX[0],2($XX[0],$key)
jz .Lshort
j .Loop8
.align 64
.Loop8:
___
# Emit eight copies of the per-byte step. Each step updates y, preloads
# the table entry for x+1, swaps the entries at x and y and leaves the
# index of this step's keystream byte in $TY.
for ($i=0;$i<8;$i++) {
$code.=<<___;
la $YY,0($YY,$TX[0]) # $i
nill $YY,255
la $XX[1],1($XX[0])
nill $XX[1],255
___
# The keystream byte of the PREVIOUS step is fetched here, one step late:
# step 1 starts the accumulator, later steps shift it left by 8 and insert
# the byte; step 0 has no previous byte to fetch.
$code.=<<___ if ($i==1);
llgc $acc,2($TY,$key)
___
$code.=<<___ if ($i>1);
sllg $acc,$acc,8
ic $acc,2($TY,$key)
___
# When the next x equals y, the entry for x+1 is taken from $TX[0], the
# value that was just stored at index y.
$code.=<<___;
llgc $TY,2($YY,$key)
stc $TX[0],2($YY,$key)
llgc $TX[1],2($XX[1],$key)
stc $TY,2($XX[0],$key)
cr $XX[1],$YY
jne .Lcmov$i
la $TX[1],0($TX[0])
.Lcmov$i:
la $TY,0($TY,$TX[0])
nill $TY,255
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
# After the eighth step: insert the last keystream byte, xor with eight
# input bytes, store, and loop. Then the byte-wise tail loop and the exit
# code that writes x (minus the pre-increment) and y back to the key.
$code.=<<___;
lg $TX[1],0($inp)
sllg $acc,$acc,8
la $inp,8($inp)
ic $acc,2($TY,$key)
xgr $acc,$TX[1]
stg $acc,0($out)
la $out,8($out)
brct $cnt,.Loop8
.Lshort:
lghi $acc,7
ngr $len,$acc
jz .Lexit
j .Loop1
.align 16
.Loop1:
la $YY,0($YY,$TX[0])
nill $YY,255
llgc $TY,2($YY,$key)
stc $TX[0],2($YY,$key)
stc $TY,2($XX[0],$key)
ar $TY,$TX[0]
ahi $XX[0],1
nill $TY,255
nill $XX[0],255
llgc $acc,0($inp)
la $inp,1($inp)
llgc $TY,2($TY,$key)
llgc $TX[0],2($XX[0],$key)
xr $acc,$TY
stc $acc,0($out)
la $out,1($out)
brct $len,.Loop1
.Lexit:
ahi $XX[0],-1
stc $XX[0],0($key)
stc $YY,1($key)
lmg %r6,%r11,48($sp)
br $rp
.size RC4,.-RC4
.string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
}
# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
#
# Generates the s390x key-setup routine. It zeroes x and y (one halfword
# store at offset 0), fills the byte table at offset 2 with 0..255
# (.L1stloop) and then shuffles it with the user key (.L2ndloop),
# restarting the key from its first byte whenever it is exhausted.
{
# These names are plain package variables, so the assignments below
# replace the values the RC4 block gave to $cnt, $acc, etc.
#   $cnt  - loop counter (256, then remaining user-key bytes)
#   $idx  - table index being filled, then the shuffle index
#   $acc  - table entry at the running position
#   $dat  - user-key byte, then the table entry at $idx
#   $ikey - running table position, biased by -256 so that it reaches a
#           multiple of 256 after 256 steps
#   $iinp - offset into the user key
$cnt="%r0";
$idx="%r1";
$key="%r2";
$len="%r3";
$inp="%r4";
$acc="%r5";
$dat="%r6";
$ikey="%r7";
$iinp="%r8";
$code.=<<___;
.globl RC4_set_key
.type RC4_set_key,\@function
.align 64
RC4_set_key:
stmg %r6,%r8,48($sp)
lhi $cnt,256
la $idx,0(%r0)
sth $idx,0($key)
.align 4
.L1stloop:
stc $idx,2($idx,$key)
la $idx,1($idx)
brct $cnt,.L1stloop
lghi $ikey,-256
lr $cnt,$len
la $iinp,0(%r0)
la $idx,0(%r0)
.align 16
.L2ndloop:
llgc $acc,2+256($ikey,$key)
llgc $dat,0($iinp,$inp)
la $idx,0($idx,$acc)
la $ikey,1($ikey)
la $idx,0($idx,$dat)
nill $idx,255
la $iinp,1($iinp)
tml $ikey,255
llgc $dat,2($idx,$key)
stc $dat,2+256-1($ikey,$key)
stc $acc,2($idx,$key)
jz .Ldone
brct $cnt,.L2ndloop
lr $cnt,$len
la $iinp,0(%r0)
j .L2ndloop
.Ldone:
lmg %r6,%r8,48($sp)
br $rp
.size RC4_set_key,.-RC4_set_key
___
}
# const char *RC4_options()
#
# Generates a routine that loads the address of the constant string
# "rc4(8x,char)" into %r2 and returns; the string itself is placed in
# the .rodata section.
$code.=<<___;
.globl RC4_options
.type RC4_options,\@function
.align 16
RC4_options:
larl %r2,.Loptions
br %r14
.size RC4_options,.-RC4_options
.section .rodata
.Loptions:
.align 8
.string "rc4(8x,char)"
___
# write the complete assembler source to standard output
print $code;
@@ -0,0 +1,504 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't stand for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from config
# line results in ~40% improvement (yes, even for C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again! The module *requires* config
# line *without* RC4_CHAR! As for coding "secret," I bet on partial
# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
# I simply 'inc %r8b'. Even though optimization manual discourages
# to operate on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...
# As was shown by Marc Bevand reordering of couple of load operations
# results in even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
# Latter means that if you want to *estimate* what to expect from
# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
# Intel P4 EM64T core was found to run the AMD64 code really slow...
# The only way to achieve comparable performance on P4 was to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
# compose blended code that would perform even within a 30% margin
# on both AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details...
# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# those with add/sub results in 50% performance improvement of folded
# loop...
# As was shown by Zou Nanhai, loop unrolling can improve Intel EM64T
# performance by >30% [unlike P4 32-bit case that is]. But this is
# provided that loads are reordered even more aggressively! Both code
# paths, AMD64 and EM64T, reorder loads in essentially the same manner
# as my IA-64 implementation. On Opteron this resulted in a modest 5%
# improvement [I had to test it], while final Intel P4 performance
# achieves a respectable 432MBps on a 2.8GHz processor now. For reference.
# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
# RC4_INT code-path. While if executed on Opteron, it's only 25%
# slower than the RC4_INT one [meaning that if CPU µ-arch detection
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].
# Intel Core2 was observed to perform poorly on both code paths:-( It
# apparently suffers from some kind of partial register stall, which
# occurs in 64-bit mode only [as virtually identical 32-bit loop was
# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
# cloop1 boosts its performance by 80%! This loop appears to be optimal
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.
# Command line: [flavour] output.  When the first argument contains a dot it
# is taken to be the output file name and no flavour is in effect.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 output is selected by a nasm/masm/mingw64 flavour or a .asm target;
# it enables the SEH handler section near the end of this script.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Look for the translator next to this script first, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Everything printed from here on is piped through the translator.  Fail
# loudly if the pipe cannot be set up instead of silently losing the output.
open STDOUT,"| $^X $xlate $flavour $output"
	or die "can't call $xlate: $!";
# Register map for RC4().  The four arguments use the registers marked
# arg1..arg4.  @XX and @TX are two-element register sets that the unrolled
# loops below rotate after every generated step; $YY holds the y index and
# $TY the table value fetched through it.
$dat="%rdi"; # arg1
$len="%rsi"; # arg2
$inp="%rdx"; # arg3
$out="%rcx"; # arg4
@XX=("%r8","%r10");
@TX=("%r9","%r11");
$YY="%r12";
$TY="%r13";
# Entry sequence: return at once when len is zero, push %rbx/%r12/%r13,
# advance $dat by 8 so that x and y are read from -8($dat) and -4($dat),
# and branch to .LRC4_CHAR when the dword at 256($dat) is -1 (the value
# RC4_set_key further down stores on its byte-table path).  Otherwise x is
# pre-incremented, the first table dword is loaded, and control goes to
# .Lloop1 when len has no bits set above the low three, else to .Lloop8.
$code=<<___;
.text
.globl RC4
.type RC4,\@function,4
.align 16
RC4: or $len,$len
jne .Lentry
ret
.Lentry:
push %rbx
push %r12
push %r13
.Lprologue:
add \$8,$dat
movl -8($dat),$XX[0]#d
movl -4($dat),$YY#d
cmpl \$-1,256($dat)
je .LRC4_CHAR
inc $XX[0]#b
movl ($dat,$XX[0],4),$TX[0]#d
test \$-8,$len
jz .Lloop1
jmp .Lloop8
.align 16
.Lloop8:
___
# Generate the eight steps that make up the body of .Lloop8 (dword table).
# Each step rotates %rax by 8 bits and loads one table byte into %al.  The
# table entry for the next x is fetched before the store to the y slot, so
# when next-x equals y the cmove substitutes the value just stored there.
# The @XX/@TX pairs swap roles after every step, which is why the text is
# regenerated rather than emitted once.
for ($i=0;$i<8;$i++) {
$code.=<<___;
add $TX[0]#b,$YY#b
mov $XX[0],$XX[1]
movl ($dat,$YY,4),$TY#d
ror \$8,%rax # ror is redundant when $i=0
inc $XX[1]#b
movl ($dat,$XX[1],4),$TX[1]#d
cmp $XX[1],$YY
movl $TX[0]#d,($dat,$YY,4)
cmove $TX[0],$TX[1]
movl $TY#d,($dat,$XX[0],4)
add $TX[0]#b,$TY#b
movb ($dat,$TY,4),%al
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
# Tail of .Lloop8: one more rotate of %rax, then the eight collected bytes
# are XORed with eight input bytes and stored; the loop repeats while len
# still has bits set above the low three, and any remainder goes to .Lloop1.
# .Lloop1 is the one-byte-per-iteration loop for the dword table.
# .LRC4_CHAR is the entry for the byte table: it pre-increments x, loads
# the first table byte, and chooses .Lcloop1 when fewer than eight bytes
# remain or when the dword at 260($dat) is non-zero, otherwise .Lcloop8,
# which begins by loading eight input bytes into %eax and %ebx.
$code.=<<___;
ror \$8,%rax
sub \$8,$len
xor ($inp),%rax
add \$8,$inp
mov %rax,($out)
add \$8,$out
test \$-8,$len
jnz .Lloop8
cmp \$0,$len
jne .Lloop1
jmp .Lexit
.align 16
.Lloop1:
add $TX[0]#b,$YY#b
movl ($dat,$YY,4),$TY#d
movl $TX[0]#d,($dat,$YY,4)
movl $TY#d,($dat,$XX[0],4)
add $TY#b,$TX[0]#b
inc $XX[0]#b
movl ($dat,$TX[0],4),$TY#d
movl ($dat,$XX[0],4),$TX[0]#d
xorb ($inp),$TY#b
inc $inp
movb $TY#b,($out)
inc $out
dec $len
jnz .Lloop1
jmp .Lexit
.align 16
.LRC4_CHAR:
add \$1,$XX[0]#b
movzb ($dat,$XX[0]),$TX[0]#d
test \$-8,$len
jz .Lcloop1
cmpl \$0,260($dat)
jnz .Lcloop1
jmp .Lcloop8
.align 16
.Lcloop8:
mov ($inp),%eax
mov 4($inp),%ebx
___
# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
# A single generator produces all eight byte-table steps: steps 0-3 fold
# their keystream byte into %eax (through %al), steps 4-7 into %ebx
# (through %bl).  Apart from that accumulator the emitted text is the same
# for every step, and the @XX/@TX pairs are rotated after each one.
for ($i=0;$i<8;$i++) {
my ($word,$byte) = ($i<4) ? ("%eax","%al") : ("%ebx","%bl");
$code.=<<___;
add $TX[0]#b,$YY#b
lea 1($XX[0]),$XX[1]
movzb ($dat,$YY),$TY#d
movzb $XX[1]#b,$XX[1]#d
movzb ($dat,$XX[1]),$TX[1]#d
movb $TX[0]#b,($dat,$YY)
cmp $XX[1],$YY
movb $TY#b,($dat,$XX[0])
jne .Lcmov$i # Intel cmov is sloooow...
mov $TX[0],$TX[1]
.Lcmov$i:
add $TX[0]#b,$TY#b
xor ($dat,$TY),$byte
ror \$8,$word
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
# Tail of .Lcloop8: len is reduced by 8, %eax and %ebx are stored as the
# eight output bytes, and the loop repeats while len still has bits set
# above the low three; a non-zero remainder continues in .Lcloop1.
$code.=<<___;
lea -8($len),$len
mov %eax,($out)
lea 8($inp),$inp
mov %ebx,4($out)
lea 8($out),$out
test \$-8,$len
jnz .Lcloop8
cmp \$0,$len
jne .Lcloop1
jmp .Lexit
___
# .Lcloop1 is the one-byte-per-iteration loop for the byte table.
# .Lexit is shared by every path: it undoes the pre-increment of x, writes
# x and y back to -8($dat) and -4($dat), reloads %r13/%r12/%rbx from the
# stack, releases the 24 bytes and returns.
$code.=<<___;
.align 16
.Lcloop1:
add $TX[0]#b,$YY#b
movzb ($dat,$YY),$TY#d
movb $TX[0]#b,($dat,$YY)
movb $TY#b,($dat,$XX[0])
add $TX[0]#b,$TY#b
add \$1,$XX[0]#b
movzb $TY#b,$TY#d
movzb $XX[0]#b,$XX[0]#d
movzb ($dat,$TY),$TY#d
movzb ($dat,$XX[0]),$TX[0]#d
xorb ($inp),$TY#b
lea 1($inp),$inp
movb $TY#b,($out)
lea 1($out),$out
sub \$1,$len
jnz .Lcloop1
jmp .Lexit
.align 16
.Lexit:
sub \$1,$XX[0]#b
movl $XX[0]#d,-8($dat)
movl $YY#d,-4($dat)
mov (%rsp),%r13
mov 8(%rsp),%r12
mov 16(%rsp),%rbx
add \$24,%rsp
.Lepilogue:
ret
.size RC4,.-RC4
___
# RC4_set_key and RC4_options for x86_64.
#
# RC4_set_key reuses $dat/$len/$inp from the RC4 register map ($dat is
# advanced by 8 first, $inp is moved to the end of the key and $len negated
# so that ($inp,$len) walks the key bytes and wraps via the copy in %rcx).
# The table layout is picked at run time from OPENSSL_ia32cap_P:
#   bit 20 clear - dword table, built by .Lw1stloop/.Lw2ndloop;
#   bit 20 set   - byte table, built by .Lc1stloop/.Lc2ndloop; bit 30 is
#                  recorded as 0 or 1 in the dword at 260($dat), and -1 is
#                  stored at 256($dat) once the table is complete.
# Both paths end at .Lexit_key, which stores zero to -8($dat) and -4($dat).
#
# RC4_options returns a pointer into the .Lopts string list: the first
# string when bit 20 is clear, the second (+12 bytes) when bit 20 is set
# and bit 30 clear, the third (+12+13 bytes) when both are set.
$idx="%r8";	# running swap index
$ido="%r9";	# table position, wraps after 256 steps (carry ends the loop)
$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl RC4_set_key
.type RC4_set_key,\@function,3
.align 16
RC4_set_key:
lea 8($dat),$dat
lea ($inp,$len),$inp
neg $len
mov $len,%rcx
xor %eax,%eax
xor $ido,$ido
xor %r10,%r10
xor %r11,%r11
mov OPENSSL_ia32cap_P(%rip),$idx#d
bt \$20,$idx#d
jnc .Lw1stloop
bt \$30,$idx#d
setc $ido#b
mov $ido#d,260($dat)
jmp .Lc1stloop
.align 16
.Lw1stloop:
mov %eax,($dat,%rax,4)
add \$1,%al
jnc .Lw1stloop
xor $ido,$ido
xor $idx,$idx
.align 16
.Lw2ndloop:
mov ($dat,$ido,4),%r10d
add ($inp,$len,1),$idx#b
add %r10b,$idx#b
add \$1,$len
mov ($dat,$idx,4),%r11d
cmovz %rcx,$len
mov %r10d,($dat,$idx,4)
mov %r11d,($dat,$ido,4)
add \$1,$ido#b
jnc .Lw2ndloop
jmp .Lexit_key
.align 16
.Lc1stloop:
mov %al,($dat,%rax)
add \$1,%al
jnc .Lc1stloop
xor $ido,$ido
xor $idx,$idx
.align 16
.Lc2ndloop:
mov ($dat,$ido),%r10b
add ($inp,$len),$idx#b
add %r10b,$idx#b
add \$1,$len
mov ($dat,$idx),%r11b
jnz .Lcnowrap
mov %rcx,$len
.Lcnowrap:
mov %r10b,($dat,$idx)
mov %r11b,($dat,$ido)
add \$1,$ido#b
jnc .Lc2ndloop
movl \$-1,256($dat)
.align 16
.Lexit_key:
xor %eax,%eax
mov %eax,-8($dat)
mov %eax,-4($dat)
ret
.size RC4_set_key,.-RC4_set_key
.globl RC4_options
.type RC4_options,\@abi-omnipotent
.align 16
RC4_options:
lea .Lopts(%rip),%rax
mov OPENSSL_ia32cap_P(%rip),%edx
bt \$20,%edx
jnc .Ldone
add \$12,%rax
bt \$30,%edx
jnc .Ldone
add \$13,%rax
.Ldone:
ret
.align 64
.Lopts:
.asciz "rc4(8x,int)"
.asciz "rc4(8x,char)"
.asciz "rc4(1x,char)"
.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
.size RC4_options,.-RC4_options
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Emitted only for Win64 targets.  Two handlers are generated:
#   stream_se_handler - referenced by .LSEH_info_RC4.  When context->Rip
#       lies between .Lprologue and .Lepilogue it reloads %rbx/%r12/%r13
#       from the 24 bytes RC4 pushed and records them, together with the
#       adjusted stack pointer, in the context; it then falls into the
#       shared tail.
#   key_se_handler - referenced by .LSEH_info_RC4_set_key.  It only
#       records %rsi/%rdi before running the shared tail.
# The shared tail (.Lcommon_seh_exit) copies the context (154 quadwords)
# into disp->ContextRecord, calls RtlVirtualUnwind and returns 1
# (ExceptionContinueSearch).
# NOTE(review): the .LSEH_begin_*/.LSEH_end_* labels used in .pdata are not
# defined in this script - presumably x86_64-xlate.pl supplies them; confirm.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type stream_se_handler,\@abi-omnipotent
.align 16
stream_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue(%rip),%r10
cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
lea .Lepilogue(%rip),%r10
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
lea 24(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%r12
mov -24(%rax),%r13
mov %rbx,144($context) # restore context->Rbx
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
jmp .Lcommon_seh_exit
.size stream_se_handler,.-stream_se_handler
.type key_se_handler,\@abi-omnipotent
.align 16
key_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
.Lcommon_seh_exit:
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size key_se_handler,.-key_se_handler
.section .pdata
.align 4
.rva .LSEH_begin_RC4
.rva .LSEH_end_RC4
.rva .LSEH_info_RC4
.rva .LSEH_begin_RC4_set_key
.rva .LSEH_end_RC4_set_key
.rva .LSEH_info_RC4_set_key
.section .xdata
.align 8
.LSEH_info_RC4:
.byte 9,0,0,0
.rva stream_se_handler
.LSEH_info_RC4_set_key:
.byte 9,0,0,0
.rva key_se_handler
___
}
# Turn the "#b", "#w" and "#d" operand-size markers used in the templates
# above into plain b/w/d register-name suffixes, then hand the text to the
# translator that STDOUT is piped to.
$code =~ s/#([bwd])/$1/gm;
print $code;
# STDOUT is a pipe: close() is where buffered write errors and a non-zero
# exit status of the child are reported, so a failure here must stop the
# build instead of leaving a truncated or missing output file behind.
close STDOUT
	or die "error closing STDOUT: " . ($! ? "$!" : "child exit status $?");
@@ -0,0 +1,193 @@
/* crypto/rc4/rc4.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <openssl/rc4.h>
#include <openssl/evp.h>
/* Help text printed line by line to stderr by main(); the NULL entry
 * terminates the list. */
char *usage[]={
"usage: rc4 args\n",
"\n",
" -in arg - input file - default stdin\n",
" -out arg - output file - default stdout\n",
" -key key - password\n",
NULL
};
int main(int argc, char *argv[])
{
FILE *in=NULL,*out=NULL;
char *infile=NULL,*outfile=NULL,*keystr=NULL;
RC4_KEY key;
char buf[BUFSIZ];
int badops=0,i;
char **pp;
unsigned char md[MD5_DIGEST_LENGTH];
argc--;
argv++;
while (argc >= 1)
{
if (strcmp(*argv,"-in") == 0)
{
if (--argc < 1) goto bad;
infile= *(++argv);
}
else if (strcmp(*argv,"-out") == 0)
{
if (--argc < 1) goto bad;
outfile= *(++argv);
}
else if (strcmp(*argv,"-key") == 0)
{
if (--argc < 1) goto bad;
keystr= *(++argv);
}
else
{
fprintf(stderr,"unknown option %s\n",*argv);
badops=1;
break;
}
argc--;
argv++;
}
if (badops)
{
bad:
for (pp=usage; (*pp != NULL); pp++)
fprintf(stderr,"%s",*pp);
exit(1);
}
if (infile == NULL)
in=stdin;
else
{
in=fopen(infile,"r");
if (in == NULL)
{
perror("open");
exit(1);
}
}
if (outfile == NULL)
out=stdout;
else
{
out=fopen(outfile,"w");
if (out == NULL)
{
perror("open");
exit(1);
}
}
#ifdef OPENSSL_SYS_MSDOS
/* This should set the file to binary mode. */
{
#include <fcntl.h>
setmode(fileno(in),O_BINARY);
setmode(fileno(out),O_BINARY);
}
#endif
if (keystr == NULL)
{ /* get key */
i=EVP_read_pw_string(buf,BUFSIZ,"Enter RC4 password:",0);
if (i != 0)
{
OPENSSL_cleanse(buf,BUFSIZ);
fprintf(stderr,"bad password read\n");
exit(1);
}
keystr=buf;
}
EVP_Digest((unsigned char *)keystr,strlen(keystr),md,NULL,EVP_md5(),NULL);
OPENSSL_cleanse(keystr,strlen(keystr));
RC4_set_key(&key,MD5_DIGEST_LENGTH,md);
for(;;)
{
i=fread(buf,1,BUFSIZ,in);
if (i == 0) break;
if (i < 0)
{
perror("read");
exit(1);
}
RC4(&key,(unsigned int)i,(unsigned char *)buf,
(unsigned char *)buf);
i=fwrite(buf,(unsigned int)i,1,out);
if (i != 1)
{
perror("write");
exit(1);
}
}
fclose(out);
fclose(in);
exit(0);
return(1);
}
@@ -0,0 +1,89 @@
/* crypto/rc4/rc4.h */
/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#ifndef HEADER_RC4_H
#define HEADER_RC4_H
#include <openssl/opensslconf.h> /* OPENSSL_NO_RC4, RC4_INT */
#ifdef OPENSSL_NO_RC4
#error RC4 is disabled.
#endif
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
 * RC4 cipher state.  RC4_INT is configured in <openssl/opensslconf.h>.
 */
typedef struct rc4_key_st
{
RC4_INT x,y; /* stream indices, carried from one RC4() call to the next */
RC4_INT data[256]; /* table indexed by x and y */
} RC4_KEY;
/* Returns a constant string naming the RC4 build variant in use. */
const char *RC4_options(void);
/* Initialises *key from the len bytes at data. */
void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
/* Processes len bytes from indata into outdata, updating *key. */
void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
unsigned char *outdata);
#ifdef __cplusplus
}
#endif
#endif
@@ -0,0 +1,315 @@
/* crypto/rc4/rc4_enc.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <openssl/rc4.h>
#include "rc4_locl.h"
/* RC4 as implemented from a posting from
* Newsgroups: sci.crypt
* From: sterndark@netcom.com (David Sterndark)
* Subject: RC4 Algorithm revealed.
* Message-ID: <sternCvKL4B.Hyy@netcom.com>
* Date: Wed, 14 Sep 1994 06:35:31 GMT
*/
/*
 * RC4() - combine len bytes of indata with the RC4 keystream and store the
 * result in outdata.
 *
 * key     - cipher state; x, y and data[] are read here, data[] entries
 *           are swapped as the stream advances, and x/y are written back
 *           before returning
 * len     - number of bytes to process
 * indata  - source bytes
 * outdata - destination bytes
 *
 * When RC4_CHUNK is defined and both pointers are aligned to
 * sizeof(RC4_CHUNK), data is handled a machine word at a time with separate
 * big- and little-endian code; otherwise (or when RC4_CHUNK is not defined)
 * the byte-oriented loops at the bottom are used.
 */
void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
unsigned char *outdata)
{
register RC4_INT *d;
register RC4_INT x,y,tx,ty;
size_t i;
x=key->x;
y=key->y;
d=key->data;
#if defined(RC4_CHUNK)
/*
* The original reason for implementing this(*) was the fact that
* pre-21164a Alpha CPUs don't have byte load/store instructions
* and e.g. a byte store has to be done with 64-bit load, shift,
* and, or and finally 64-bit store. Peeking data and operating
* at natural word size made it possible to reduce amount of
* instructions as well as to perform early read-ahead without
* suffering from RAW (read-after-write) hazard. This resulted
* in ~40%(**) performance improvement on 21064 box with gcc.
* But it's not only Alpha users who win here:-) Thanks to the
* early-n-wide read-ahead this implementation also exhibits
* >40% speed-up on SPARC and 20-30% on 64-bit MIPS (depending
* on sizeof(RC4_INT)).
*
* (*) "this" means code which recognizes the case when input
* and output pointers appear to be aligned at natural CPU
* word boundary
* (**) i.e. according to 'apps/openssl speed rc4' benchmark,
* crypto/rc4/rc4speed.c exhibits almost 70% speed-up...
*
* Caveats.
*
* - RC4_CHUNK="unsigned long long" should be a #1 choice for
* UltraSPARC. Unfortunately gcc generates very slow code
* (2.5-3 times slower than one generated by Sun's WorkShop
* C) and therefore gcc (at least 2.95 and earlier) should
* always be told that RC4_CHUNK="unsigned long".
*
* <appro@fy.chalmers.se>
*/
/* One keystream step: advances x and y, swaps d[x] and d[y], and yields
 * the keystream byte widened to RC4_CHUNK. */
# define RC4_STEP ( \
x=(x+1) &0xff, \
tx=d[x], \
y=(tx+y)&0xff, \
ty=d[y], \
d[y]=tx, \
d[x]=ty, \
(RC4_CHUNK)d[(tx+ty)&0xff]\
)
/* Word path is taken only when both pointers are RC4_CHUNK-aligned. */
if ( ( ((size_t)indata & (sizeof(RC4_CHUNK)-1)) |
((size_t)outdata & (sizeof(RC4_CHUNK)-1)) ) == 0 )
{
RC4_CHUNK ichunk,otp;
const union { long one; char little; } is_endian = {1};
/*
* I reckon we can afford to implement both endian
* cases and to decide which way to take at run-time
* because the machine code appears to be very compact
* and redundant 1-2KB is perfectly tolerable (i.e.
* in case the compiler fails to eliminate it:-). By
* suggestion from Terrel Larson <terr@terralogic.net>
* who also stands for the is_endian union:-)
*
* Special notes.
*
* - is_endian is declared automatic as doing otherwise
* (declaring static) prevents gcc from eliminating
* the redundant code;
* - compilers (those I've tried) don't seem to have
* problems eliminating either the operators guarded
* by "if (sizeof(RC4_CHUNK)==8)" or the condition
* expressions themselves so I've got 'em to replace
* corresponding #ifdefs from the previous version;
* - I chose to let the redundant switch cases when
* sizeof(RC4_CHUNK)!=8 be (were also #ifdefed
* before);
* - in case you wonder "&(sizeof(RC4_CHUNK)*8-1)" in
* [LB]ESHFT guards against "shift is out of range"
* warnings when sizeof(RC4_CHUNK)!=8
*
* <appro@fy.chalmers.se>
*/
if (!is_endian.little)
{ /* BIG-ENDIAN CASE */
# define BESHFT(c) (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1))
/* whole words: first keystream byte goes to the most significant byte */
for (;len&(0-sizeof(RC4_CHUNK));len-=sizeof(RC4_CHUNK))
{
ichunk = *(RC4_CHUNK *)indata;
otp = RC4_STEP<<BESHFT(0);
otp |= RC4_STEP<<BESHFT(1);
otp |= RC4_STEP<<BESHFT(2);
otp |= RC4_STEP<<BESHFT(3);
if (sizeof(RC4_CHUNK)==8)
{
otp |= RC4_STEP<<BESHFT(4);
otp |= RC4_STEP<<BESHFT(5);
otp |= RC4_STEP<<BESHFT(6);
otp |= RC4_STEP<<BESHFT(7);
}
*(RC4_CHUNK *)outdata = otp^ichunk;
indata += sizeof(RC4_CHUNK);
outdata += sizeof(RC4_CHUNK);
}
/*
 * Trailing bytes.  NOTE(review): a whole RC4_CHUNK is loaded from
 * indata and from outdata and a whole RC4_CHUNK is stored back even
 * though fewer than sizeof(RC4_CHUNK) bytes remain; mask selects the
 * len leading bytes so the rest of the output word keeps its old value.
 */
if (len)
{
RC4_CHUNK mask=(RC4_CHUNK)-1, ochunk;
ichunk = *(RC4_CHUNK *)indata;
ochunk = *(RC4_CHUNK *)outdata;
otp = 0;
i = BESHFT(0);
mask <<= (sizeof(RC4_CHUNK)-len)<<3;
/* cases fall through on purpose: one keystream step per remaining byte */
switch (len&(sizeof(RC4_CHUNK)-1))
{
case 7: otp = RC4_STEP<<i, i-=8;
case 6: otp |= RC4_STEP<<i, i-=8;
case 5: otp |= RC4_STEP<<i, i-=8;
case 4: otp |= RC4_STEP<<i, i-=8;
case 3: otp |= RC4_STEP<<i, i-=8;
case 2: otp |= RC4_STEP<<i, i-=8;
case 1: otp |= RC4_STEP<<i, i-=8;
case 0: ; /*
* it's never the case,
* but it has to be here
* for ultrix?
*/
}
ochunk &= ~mask;
ochunk |= (otp^ichunk) & mask;
*(RC4_CHUNK *)outdata = ochunk;
}
key->x=x;
key->y=y;
return;
}
else
{ /* LITTLE-ENDIAN CASE */
# define LESHFT(c) (((c)*8)&(sizeof(RC4_CHUNK)*8-1))
/* whole words: first keystream byte goes to the least significant byte */
for (;len&(0-sizeof(RC4_CHUNK));len-=sizeof(RC4_CHUNK))
{
ichunk = *(RC4_CHUNK *)indata;
otp = RC4_STEP;
otp |= RC4_STEP<<8;
otp |= RC4_STEP<<16;
otp |= RC4_STEP<<24;
if (sizeof(RC4_CHUNK)==8)
{
otp |= RC4_STEP<<LESHFT(4);
otp |= RC4_STEP<<LESHFT(5);
otp |= RC4_STEP<<LESHFT(6);
otp |= RC4_STEP<<LESHFT(7);
}
*(RC4_CHUNK *)outdata = otp^ichunk;
indata += sizeof(RC4_CHUNK);
outdata += sizeof(RC4_CHUNK);
}
/*
 * Trailing bytes, handled like the big-endian case above but with the
 * mask and shifts running from the least significant byte upwards.
 */
if (len)
{
RC4_CHUNK mask=(RC4_CHUNK)-1, ochunk;
ichunk = *(RC4_CHUNK *)indata;
ochunk = *(RC4_CHUNK *)outdata;
otp = 0;
i = 0;
mask >>= (sizeof(RC4_CHUNK)-len)<<3;
/* cases fall through on purpose: one keystream step per remaining byte */
switch (len&(sizeof(RC4_CHUNK)-1))
{
case 7: otp = RC4_STEP, i+=8;
case 6: otp |= RC4_STEP<<i, i+=8;
case 5: otp |= RC4_STEP<<i, i+=8;
case 4: otp |= RC4_STEP<<i, i+=8;
case 3: otp |= RC4_STEP<<i, i+=8;
case 2: otp |= RC4_STEP<<i, i+=8;
case 1: otp |= RC4_STEP<<i, i+=8;
case 0: ; /*
* it's never the case,
* but it has to be here
* for ultrix?
*/
}
ochunk &= ~mask;
ochunk |= (otp^ichunk) & mask;
*(RC4_CHUNK *)outdata = ochunk;
}
key->x=x;
key->y=y;
return;
}
}
#endif
/* Byte-at-a-time path: one keystream step XORed with one input byte. */
#define LOOP(in,out) \
x=((x+1)&0xff); \
tx=d[x]; \
y=(tx+y)&0xff; \
d[x]=ty=d[y]; \
d[y]=tx; \
(out) = d[(tx+ty)&0xff]^ (in);
/* RC4_INDEX selects indexed access (a[i]) over post-incremented pointers. */
#ifndef RC4_INDEX
#define RC4_LOOP(a,b,i) LOOP(*((a)++),*((b)++))
#else
#define RC4_LOOP(a,b,i) LOOP(a[i],b[i])
#endif
/* number of complete 8-byte groups, processed with an unrolled body */
i=len>>3;
if (i)
{
for (;;)
{
RC4_LOOP(indata,outdata,0);
RC4_LOOP(indata,outdata,1);
RC4_LOOP(indata,outdata,2);
RC4_LOOP(indata,outdata,3);
RC4_LOOP(indata,outdata,4);
RC4_LOOP(indata,outdata,5);
RC4_LOOP(indata,outdata,6);
RC4_LOOP(indata,outdata,7);
#ifdef RC4_INDEX
indata+=8;
outdata+=8;
#endif
if (--i == 0) break;
}
}
/* remaining 1..7 bytes, if any; the loop body is left via break */
i=len&0x07;
if (i)
{
for (;;)
{
RC4_LOOP(indata,outdata,0); if (--i == 0) break;
RC4_LOOP(indata,outdata,1); if (--i == 0) break;
RC4_LOOP(indata,outdata,2); if (--i == 0) break;
RC4_LOOP(indata,outdata,3); if (--i == 0) break;
RC4_LOOP(indata,outdata,4); if (--i == 0) break;
RC4_LOOP(indata,outdata,5); if (--i == 0) break;
RC4_LOOP(indata,outdata,6); if (--i == 0) break;
}
}
key->x=x;
key->y=y;
}
@@ -0,0 +1,5 @@
#ifndef HEADER_RC4_LOCL_H
#define HEADER_RC4_LOCL_H
#include <openssl/opensslconf.h>
#include <cryptlib.h>
#endif
@@ -0,0 +1,150 @@
/* crypto/rc4/rc4_skey.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <openssl/rc4.h>
#include "rc4_locl.h"
#include <openssl/opensslv.h>
const char RC4_version[]="RC4" OPENSSL_VERSION_PTEXT;
/* Describe the compiled-in RC4 configuration as a static string.
 * "idx" vs "ptr" reflects whether RC4_INDEX was defined at build time;
 * "char" vs "int" reflects whether RC4_INT is exactly one byte wide.
 * The returned pointer refers to a string literal and must not be freed. */
const char *RC4_options(void)
	{
	const int byte_sized = (sizeof(RC4_INT) == 1);

#ifdef RC4_INDEX
	return(byte_sized ? "rc4(idx,char)" : "rc4(idx,int)");
#else
	return(byte_sized ? "rc4(ptr,char)" : "rc4(ptr,int)");
#endif
	}
/* RC4 as implemented from a posting from
* Newsgroups: sci.crypt
* From: sterndark@netcom.com (David Sterndark)
* Subject: RC4 Algorithm revealed.
* Message-ID: <sternCvKL4B.Hyy@netcom.com>
* Date: Wed, 14 Sep 1994 06:35:31 GMT
*/
/* Initialise an RC4 key schedule from `len` bytes of key material.
 *
 *   key  - schedule to fill in; key->x and key->y are reset to 0 and
 *          key->data receives the keyed permutation.
 *   len  - number of key bytes available at `data`.
 *   data - the key bytes; they are consumed cyclically (index id1 wraps
 *          back to 0 when it reaches len).
 *
 * NOTE(review): the wrap test is `++id1 == len`, so a len of 0 or less
 * never wraps and keeps reading past data[0] - callers presumably always
 * pass len >= 1; confirm.
 */
void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
{
register RC4_INT tmp;
register int id1,id2;
register RC4_INT *d;
unsigned int i;
d= &(key->data[0]);
key->x = 0;
key->y = 0;
id1=id2=0;
/* One key-scheduling step for slot n of table d: mix the next key byte and
 * the current slot value into id2 (mod 256), advance the cyclic key index
 * id1, then swap d[n] with d[id2].  Uses the locals tmp, id1, id2 and the
 * parameters data and len of the enclosing function. */
#define SK_LOOP(d,n) { \
tmp=d[(n)]; \
id2 = (data[id1] + tmp + id2) & 0xff; \
if (++id1 == len) id1=0; \
d[(n)]=d[id2]; \
d[id2]=tmp; }
#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
defined(__INTEL__) || \
defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
if (sizeof(RC4_INT) > 1) {
/*
* Unlike all other x86 [and x86_64] implementations,
* Intel P4 core [including EM64T] was found to perform
* poorly with wider RC4_INT. Performance improvement
* for IA-32 hand-coded assembler turned out to be 2.8x
* if re-coded for RC4_CHAR! It's however inappropriate
* to just switch to RC4_CHAR for x86[_64], as non-P4
* implementations suffer from significant performance
* losses then, e.g. PIII exhibits >2x deterioration,
* and so does Opteron. In order to assure optimal
* all-round performance, let us [try to] detect P4 at
* run-time by checking upon HTT bit in CPU capability
* vector and set up compressed key schedule, which is
* recognized by correspondingly updated assembler
* module...
* <appro@fy.chalmers.se>
*/
if (OPENSSL_ia32cap_P & (1<<28)) {
/* Compressed layout: the 256-entry table is stored one byte per entry,
 * viewed through cp, instead of one RC4_INT per entry. */
unsigned char *cp=(unsigned char *)d;
for (i=0;i<256;i++) cp[i]=i;
for (i=0;i<256;i++) SK_LOOP(cp,i);
/* mark schedule as compressed! */
/* The marker is written to the first RC4_INT element that lies after the
 * 256 bytes occupied by the byte-wide table. */
d[256/sizeof(RC4_INT)]=-1;
return;
}
}
# endif
#endif
/* Regular layout: identity permutation, then 256 scheduling steps,
 * unrolled four per loop iteration. */
for (i=0; i < 256; i++) d[i]=i;
for (i=0; i < 256; i+=4)
{
SK_LOOP(d,i+0);
SK_LOOP(d,i+1);
SK_LOOP(d,i+2);
SK_LOOP(d,i+3);
}
}
@@ -0,0 +1,73 @@
//
// gettsc.inl
//
// gives access to the Pentium's (secret) cycle counter
//
// This software was written by Leonard Janke (janke@unixg.ubc.ca)
// in 1996-7 and is entered, by him, into the public domain.
#if defined(__WATCOMC__)
// Watcom variant: GetTSC is only declared here; its body is supplied by the
// "#pragma aux" below, which emits the raw bytes 0x0f 0x31 (the RDTSC opcode)
// and then stores EAX through the pointer received in EDI.  EDX and EAX are
// declared as modified.
void GetTSC(unsigned long&);
#pragma aux GetTSC = 0x0f 0x31 "mov [edi], eax" parm [edi] modify [edx eax];
#elif defined(__GNUC__)
// GNU C++ variant: read the CPU time-stamp counter into `tsc`.
//
// The bytes 15, 49 (0x0f 0x31) encode RDTSC, which leaves the counter in
// EDX:EAX.  Only the EAX half is reported, matching the Watcom and MSVC
// variants in this file, so the value wraps quickly and is only meaningful
// for short intervals.
//
// The output is bound to the accumulator with the "a" constraint and EDX is
// declared as clobbered.  An output register must not also appear in the
// clobber list, and "=eax" is not a register name in constraint syntax.
inline
void GetTSC(unsigned long& tsc)
  {
  asm volatile(".byte 15, 49\n\t"
	       : "=a" (tsc)
	       :
	       : "%edx");
  }
#elif defined(_MSC_VER)
// MSVC variant: read the CPU time-stamp counter into `tsc`.
// The two _emit directives place the bytes 0x0f 0x31 (the RDTSC opcode) in
// the instruction stream; the value left in EAX is then copied out through
// the local `a`.  Only EAX is captured, i.e. the low half of the counter.
inline
void GetTSC(unsigned long& tsc)
{
unsigned long a;
__asm _emit 0fh
__asm _emit 31h
__asm mov a, eax;
tsc=a;
}
#endif
#include <stdio.h>
#include <stdlib.h>
#include <openssl/rc4.h>
void main(int argc,char *argv[])
{
unsigned char buffer[1024];
RC4_KEY ctx;
unsigned long s1,s2,e1,e2;
unsigned char k[16];
unsigned long data[2];
unsigned char iv[8];
int i,num=64,numm;
int j=0;
if (argc >= 2)
num=atoi(argv[1]);
if (num == 0) num=256;
if (num > 1024-16) num=1024-16;
numm=num+8;
for (j=0; j<6; j++)
{
for (i=0; i<10; i++) /**/
{
RC4(&ctx,numm,buffer,buffer);
GetTSC(s1);
RC4(&ctx,numm,buffer,buffer);
GetTSC(e1);
GetTSC(s2);
RC4(&ctx,num,buffer,buffer);
GetTSC(e2);
RC4(&ctx,num,buffer,buffer);
}
printf("RC4 (%d bytes) %d %d (%d) - 8 bytes\n",num,
e1-s1,e2-s2,(e1-s1)-(e2-s2));
}
}
@@ -0,0 +1,253 @@
/* crypto/rc4/rc4speed.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* 11-Sep-92 Andrew Daviel Support for Silicon Graphics IRIX added */
/* 06-Apr-92 Luke Brennan Support for VMS and add extra signal calls */
#if !defined(OPENSSL_SYS_MSDOS) && (!defined(OPENSSL_SYS_VMS) || defined(__DECC)) && !defined(OPENSSL_SYS_MACOSX)
#define TIMES
#endif
#include <stdio.h>
#include <openssl/e_os2.h>
#include OPENSSL_UNISTD_IO
OPENSSL_DECLARE_EXIT
#ifndef OPENSSL_SYS_NETWARE
#include <signal.h>
#endif
#ifndef _IRIX
#include <time.h>
#endif
#ifdef TIMES
#include <sys/types.h>
#include <sys/times.h>
#endif
/* Depending on the VMS version, the tms structure is perhaps defined.
The __TMS macro will show if it was. If it wasn't defined, we should
undefine TIMES, since that tells the rest of the program how things
should be handled. -- Richard Levitte */
#if defined(OPENSSL_SYS_VMS_DECC) && !defined(__TMS)
#undef TIMES
#endif
#ifndef TIMES
#include <sys/timeb.h>
#endif
#if defined(sun) || defined(__ultrix)
#define _POSIX_SOURCE
#include <limits.h>
#include <sys/param.h>
#endif
#include <openssl/rc4.h>
/* The following is from the times(3) man page. It may need to be changed. */
#ifndef HZ
#ifndef CLK_TCK
#define HZ 100.0
#else /* CLK_TCK */
#define HZ ((double)CLK_TCK)
#endif
#endif
/* Size in bytes of the buffer encrypted by each bulk RC4 call in main(). */
#define BUFSIZE ((long)1024)
/* Loop-control flag: main() sets it to 1 before each timed loop and
 * sig_done() clears it when SIGALRM is delivered. */
long run=0;
/* Timer helper, defined below: called with START to begin an interval and
 * with STOP to obtain the elapsed seconds. */
double Time_F(int s);
#ifdef SIGALRM
#if defined(__STDC__) || defined(sgi) || defined(_AIX)
#define SIGRETTYPE void
#else
#define SIGRETTYPE int
#endif
/* SIGALRM handler: re-installs itself and clears the global `run` flag,
 * which is the loop condition of the timed loops in main() when SIGALRM is
 * available.
 * Note: nothing is returned even when SIGRETTYPE expands to int. */
SIGRETTYPE sig_done(int sig);
SIGRETTYPE sig_done(int sig)
{
signal(SIGALRM,sig_done);
run=0;
#ifdef LINT
/* Presumably only here to silence an "unused parameter" diagnostic. */
sig=sig;
#endif
}
#endif
#define START 0
#define STOP 1
/* Interval timer used by the benchmark.
 *
 *   Time_F(START) records the starting point and returns 0.
 *   Any other argument (normally STOP) returns the seconds elapsed since the
 *   last START; an interval of exactly 0.0 is reported as 1e-6 so that
 *   callers can divide by the result.
 *
 * With TIMES defined the interval is the process's user CPU time from
 * times(), scaled by HZ; otherwise it is wall-clock time from ftime().
 * The start/end samples live in function-local statics.
 */
double Time_F(int s)
	{
	double elapsed;
#ifdef TIMES
	static struct tms t_begin,t_finish;

	if (s == START)
		{
		times(&t_begin);
		return(0);
		}

	times(&t_finish);
	elapsed=((double)(t_finish.tms_utime-t_begin.tms_utime))/HZ;
#else /* !times() */
	static struct timeb t_begin,t_finish;
	long ms_delta;

	if (s == START)
		{
		ftime(&t_begin);
		return(0);
		}

	ftime(&t_finish);
	ms_delta=(long)t_finish.millitm-(long)t_begin.millitm;
	elapsed=((double)(t_finish.time-t_begin.time))+((double)ms_delta)/1e3;
#endif
	if (elapsed == 0.0)
		return(1e-6);
	return(elapsed);
	}
/* RC4 speed benchmark.
 *
 * Measures two things and prints a rate for each:
 *   1. RC4_set_key() with a fixed 16-byte key, four calls per loop pass;
 *   2. RC4() over a BUFSIZE-byte buffer, encrypting in place.
 *
 * How long each loop runs depends on whether SIGALRM exists:
 *   - with SIGALRM, alarm(10) is armed and the loop runs until sig_done()
 *     clears `run`;
 *   - without it, a calibration phase first doubles `count` until `count`
 *     8-byte RC4 calls take at least 3.0 seconds, and fixed iteration
 *     counts (ca, cc) are derived from that.
 *
 * The program terminates through exit(0).
 */
int main(int argc, char **argv)
{
long count;
static unsigned char buf[BUFSIZE];
static unsigned char key[] ={
0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,
0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10,
};
RC4_KEY sch;
/* NOTE(review): `b` is never referenced in this function. */
double a,b,c,d;
#ifndef SIGALRM
/* NOTE(review): `cb` is never referenced in this function. */
long ca,cb,cc;
#endif
#ifndef TIMES
printf("To get the most accurate results, try to run this\n");
printf("program when this computer is idle.\n");
#endif
#ifndef SIGALRM
printf("First we calculate the approximate speed ...\n");
RC4_set_key(&sch,16,key);
count=10;
/* Calibration: keep doubling count until the batch takes >= 3 seconds. */
do {
long i;
/* NOTE(review): `data` is never referenced. */
unsigned long data[2];
count*=2;
Time_F(START);
for (i=count; i; i--)
RC4(&sch,8,buf,buf);
d=Time_F(STOP);
} while (d < 3.0);
ca=count/512;
cc=count*8/BUFSIZE+1;
printf("Doing RC4_set_key %ld times\n",ca);
/* Fixed-count mode: loop until count reaches the target, and report the
 * target as the number of operations performed. */
#define COND(d) (count != (d))
#define COUNT(d) (d)
#else
/* Timed mode: the macro argument is discarded, so the references to ca and
 * cc below never reach the compiler even though they are not declared in
 * this configuration. */
#define COND(c) (run)
#define COUNT(d) (count)
signal(SIGALRM,sig_done);
printf("Doing RC4_set_key for 10 seconds\n");
alarm(10);
#endif
Time_F(START);
/* NOTE(review): count advances by 4 here, so in fixed-count mode the test
 * `count != ca` can only become false when ca is a multiple of 4 - confirm
 * that the calibration always yields such a value. */
for (count=0,run=1; COND(ca); count+=4)
{
RC4_set_key(&sch,16,key);
RC4_set_key(&sch,16,key);
RC4_set_key(&sch,16,key);
RC4_set_key(&sch,16,key);
}
d=Time_F(STOP);
printf("%ld RC4_set_key's in %.2f seconds\n",count,d);
/* a = key set-ups per second. */
a=((double)COUNT(ca))/d;
#ifdef SIGALRM
printf("Doing RC4 on %ld byte blocks for 10 seconds\n",BUFSIZE);
alarm(10);
#else
printf("Doing RC4 %ld times on %ld byte blocks\n",cc,BUFSIZE);
#endif
Time_F(START);
for (count=0,run=1; COND(cc); count++)
RC4(&sch,BUFSIZE,buf,buf);
d=Time_F(STOP);
printf("%ld RC4's of %ld byte blocks in %.2f second\n",
count,BUFSIZE,d);
/* c = bytes encrypted per second. */
c=((double)COUNT(cc)*BUFSIZE)/d;
printf("RC4 set_key per sec = %12.2f (%9.3fuS)\n",a,1.0e6/a);
/* 8.0e6/c is presumably the time in microseconds per 8 bytes - confirm. */
printf("RC4 bytes per sec = %12.2f (%9.3fuS)\n",c,8.0e6/c);
exit(0);
#if defined(LINT) || defined(OPENSSL_SYS_MSDOS)
return(0);
#endif
}
@@ -0,0 +1,236 @@
/* crypto/rc4/rc4test.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../e_os.h"
#ifdef OPENSSL_NO_RC4
/* Stub used when the library is built without RC4 (OPENSSL_NO_RC4):
 * report that on stdout and exit successfully. */
int main(int argc, char *argv[])
	{
	fputs("No RC4 support\n",stdout);
	return 0;
	}
#else
#include <openssl/rc4.h>
#include <openssl/sha.h>
/* Known-answer test vectors.  main() uses rows 0..5 of each table; the
 * arrays are dimensioned for seven rows and the last row is zero. */

/* Keys: byte 0 of each row is the key length, followed by that many key
 * bytes (main() passes keys[i][0] as the length and &keys[i][1] as the
 * key). */
static unsigned char keys[7][30]={
{8,0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef},
{8,0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef},
{8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
{4,0xef,0x01,0x23,0x45},
{8,0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef},
{4,0xef,0x01,0x23,0x45},
};
/* Number of input bytes in each row of data[]. */
static unsigned char data_len[7]={8,8,8,20,28,10};
/* Inputs.  Each populated row carries one extra 0xff byte just past its
 * data_len[i] bytes. */
static unsigned char data[7][30]={
{0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xff},
{0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff},
{0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff},
{0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0xff},
{0x12,0x34,0x56,0x78,0x9A,0xBC,0xDE,0xF0,
0x12,0x34,0x56,0x78,0x9A,0xBC,0xDE,0xF0,
0x12,0x34,0x56,0x78,0x9A,0xBC,0xDE,0xF0,
0x12,0x34,0x56,0x78,0xff},
{0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff},
{0},
};
/* Expected outputs.  Each populated row ends with a 0x00 just past its
 * data_len[i] bytes; main() compares data_len[i]+1 bytes against a
 * pre-zeroed output buffer, so a write beyond the requested length shows up
 * as a mismatch. */
static unsigned char output[7][30]={
{0x75,0xb7,0x87,0x80,0x99,0xe0,0xc5,0x96,0x00},
{0x74,0x94,0xc2,0xe7,0x10,0x4b,0x08,0x79,0x00},
{0xde,0x18,0x89,0x41,0xa3,0x37,0x5d,0x3a,0x00},
{0xd6,0xa1,0x41,0xa7,0xec,0x3c,0x38,0xdf,
0xbd,0x61,0x5a,0x11,0x62,0xe1,0xc7,0xba,
0x36,0xb6,0x78,0x58,0x00},
{0x66,0xa0,0x94,0x9f,0x8a,0xf7,0xd6,0x89,
0x1f,0x7f,0x83,0x2b,0xa8,0x33,0xc0,0x0c,
0x89,0x2e,0xbe,0x30,0x14,0x3c,0xe2,0x87,
0x40,0x01,0x1e,0xcf,0x00},
{0xd6,0xa1,0x41,0xa7,0xec,0x3c,0x38,0xdf,0xbd,0x61,0x00},
{0},
};
/* Print `label` followed by the first `n` bytes of `buf`, each written as a
 * space-prefixed two-digit hex value.  No trailing newline is emitted so the
 * caller can append extra text to the line. */
static void rc4test_dump(const char *label, const unsigned char *buf, int n)
	{
	int j;

	printf("%s",label);
	for (j=0; j<n; j++)
		printf(" %02x",buf[j]);
	}

/* RC4 self-test.
 *
 * Runs four groups of checks and counts failures in `err`:
 *   1. known-answer tests for vectors 0..5 of keys/data/output;
 *   2. "end processing": vector 3 encrypted with every length below
 *      data_len[3], checking that the byte just past the requested length
 *      stays zero;
 *   3. "multi-call": vector 3 encrypted in two RC4() calls split at every
 *      position, which must equal the single-call result;
 *   4. "bulk": 2571 in-place encryptions of a 513-byte buffer, whose
 *      concatenated output is hashed with SHA-1 and compared with a fixed
 *      digest.
 *
 * The process exits through EXIT(err).
 */
int main(int argc, char *argv[])
	{
	int i,err=0;
	RC4_KEY key;
	unsigned char obuf[512];

	/* 1. Known-answer tests.  One byte more than the payload is compared
	 *    so that output written past the requested length is detected
	 *    (obuf is zeroed first and output[i] ends with 0x00). */
	for (i=0; i<6; i++)
		{
		RC4_set_key(&key,keys[i][0],&(keys[i][1]));
		memset(obuf,0x00,sizeof(obuf));
		RC4(&key,data_len[i],&(data[i][0]),obuf);
		if (memcmp(obuf,output[i],data_len[i]+1) != 0)
			{
			printf("error calculating RC4\n");
			rc4test_dump("output:",obuf,data_len[i]+1);
			printf("\n");
			rc4test_dump("expect:",output[i],data_len[i]+1);
			printf("\n");
			err++;
			}
		else
			printf("test %d ok\n",i);
		}

	/* 2. Every prefix length of vector 3. */
	printf("test end processing ");
	for (i=0; i<data_len[3]; i++)
		{
		RC4_set_key(&key,keys[3][0],&(keys[3][1]));
		memset(obuf,0x00,sizeof(obuf));
		RC4(&key,i,&(data[3][0]),obuf);
		if ((memcmp(obuf,output[3],i) != 0) || (obuf[i] != 0))
			{
			printf("error in RC4 length processing\n");
			rc4test_dump("output:",obuf,i+1);
			printf("\n");
			/* Expected: i bytes of keystream, then an untouched 00. */
			rc4test_dump("expect:",output[3],i);
			printf(" 00\n");
			err++;
			}
		else
			{
			printf(".");
			fflush(stdout);
			}
		}
	printf("done\n");

	/* 3. Vector 3 split into two consecutive RC4() calls at offset i. */
	printf("test multi-call ");
	for (i=0; i<data_len[3]; i++)
		{
		RC4_set_key(&key,keys[3][0],&(keys[3][1]));
		memset(obuf,0x00,sizeof(obuf));
		RC4(&key,i,&(data[3][0]),obuf);
		RC4(&key,data_len[3]-i,&(data[3][i]),&(obuf[i]));
		if (memcmp(obuf,output[3],data_len[3]+1) != 0)
			{
			printf("error in RC4 multi-call processing\n");
			rc4test_dump("output:",obuf,data_len[3]+1);
			printf("\n");
			rc4test_dump("expect:",output[3],data_len[3]+1);
			/* Terminate the line, as the other failure reports do. */
			printf("\n");
			err++;
			}
		else
			{
			printf(".");
			fflush(stdout);
			}
		}
	printf("done\n");

	/* 4. Bulk test. */
	printf("bulk test ");
	{
	unsigned char buf[513];
	SHA_CTX c;
	unsigned char md[SHA_DIGEST_LENGTH];
	static unsigned char expected[]={
		0xa4,0x7b,0xcc,0x00,0x3d,0xd0,0xbd,0xe1,0xac,0x5f,
		0x12,0x1e,0x45,0xbc,0xfb,0x1a,0xa1,0xf2,0x7f,0xc5 };

	/* NOTE(review): the length comes from keys[0] (8) while the key bytes
	 * come from keys[3], so the key is ef 01 23 45 followed by four of
	 * the row's zero padding bytes.  The expected digest presumably
	 * matches exactly this combination, so it is kept unchanged. */
	RC4_set_key(&key,keys[0][0],&(keys[3][1]));
	memset(buf,'\0',sizeof(buf));
	SHA1_Init(&c);
	for (i=0;i<2571;i++)
		{
		RC4(&key,sizeof(buf),buf,buf);
		SHA1_Update(&c,buf,sizeof(buf));
		}
	SHA1_Final(md,&c);

	if (memcmp(md,expected,sizeof(md)))
		{
		printf("error in RC4 bulk test\n");
		rc4test_dump("output:",md,(int)sizeof(md));
		printf("\n");
		rc4test_dump("expect:",expected,(int)sizeof(md));
		printf("\n");
		err++;
		}
	else printf("ok\n");
	}

#ifdef OPENSSL_SYS_NETWARE
	if (err) printf("ERROR: %d\n", err);
#endif
	EXIT(err);
	return(0);
	}
#endif
@@ -0,0 +1,278 @@
Newsgroups: sci.crypt,alt.security,comp.security.misc,alt.privacy
Path: ghost.dsi.unimi.it!univ-lyon1.fr!jussieu.fr!zaphod.crihan.fr!warwick!clyde.open.ac.uk!strath-cs!bnr.co.uk!bt!pipex!howland.reston.ans.net!europa.eng.gtefsd.com!MathWorks.Com!yeshua.marcam.com!charnel.ecst.csuchico.edu!csusac!csus.edu!netcom.com!sterndark
From: sterndark@netcom.com (David Sterndark)
Subject: RC4 Algorithm revealed.
Message-ID: <sternCvKL4B.Hyy@netcom.com>
Sender: sterndark@netcom.com
Organization: NETCOM On-line Communication Services (408 261-4700 guest)
X-Newsreader: TIN [version 1.2 PL1]
Date: Wed, 14 Sep 1994 06:35:31 GMT
Lines: 263
Xref: ghost.dsi.unimi.it sci.crypt:27332 alt.security:14732 comp.security.misc:11701 alt.privacy:16026
I am shocked, shocked, I tell you, shocked, to discover
that the cypherpunks have illegally and criminally revealed
a crucial RSA trade secret and harmed the security of
America by reverse engineering the RC4 algorithm and
publishing it to the world.
On Saturday morning an anonymous cypherpunk wrote:
SUBJECT: RC4 Source Code
I've tested this. It is compatible with the RC4 object module
that comes in the various RSA toolkits.
/* rc4.h */
/* Cipher state: the 256-entry table filled in by prepare_key() and the two
 * indices that rc4() loads on entry and stores back on exit. */
typedef struct rc4_key
{
unsigned char state[256];
unsigned char x;
unsigned char y;
} rc4_key;
void prepare_key(unsigned char *key_data_ptr,int key_data_len,
rc4_key *key);
void rc4(unsigned char *buffer_ptr,int buffer_len,rc4_key * key);
/*rc4.c */
#include "rc4.h"
static void swap_byte(unsigned char *a, unsigned char *b);
/* Key set-up: fill key->state with 0..255, reset key->x and key->y, then
 * make 256 passes that each swap state[counter] with state[index2], where
 * index2 accumulates the key bytes (used cyclically) and the table values.
 * Note: key_data_len is used as a modulus, so it must not be 0. */
void prepare_key(unsigned char *key_data_ptr, int key_data_len,
rc4_key *key)
{
/* swapByte is declared but not used in this function. */
unsigned char swapByte;
unsigned char index1;
unsigned char index2;
unsigned char* state;
short counter;
state = &key->state[0];
for(counter = 0; counter < 256; counter++)
state[counter] = counter;
key->x = 0;
key->y = 0;
index1 = 0;
index2 = 0;
for(counter = 0; counter < 256; counter++)
{
index2 = (key_data_ptr[index1] + state[counter] +
index2) % 256;
swap_byte(&state[counter], &state[index2]);
index1 = (index1 + 1) % key_data_len;
}
}
/* XOR buffer_len bytes of buffer_ptr in place with the keystream.
 * The indices x and y are loaded from *key on entry and written back on
 * exit, so consecutive calls continue the same keystream. */
void rc4(unsigned char *buffer_ptr, int buffer_len, rc4_key *key)
{
unsigned char x;
unsigned char y;
unsigned char* state;
unsigned char xorIndex;
/* counter is a short while buffer_len is an int. */
short counter;
x = key->x;
y = key->y;
state = &key->state[0];
for(counter = 0; counter < buffer_len; counter ++)
{
x = (x + 1) % 256;
y = (state[x] + y) % 256;
swap_byte(&state[x], &state[y]);
xorIndex = (state[x] + state[y]) % 256;
buffer_ptr[counter] ^= state[xorIndex];
}
key->x = x;
key->y = y;
}
/* Exchange the two bytes pointed to by a and b. */
static void swap_byte(unsigned char *a, unsigned char *b)
{
unsigned char swapByte;
swapByte = *a;
*a = *b;
*b = swapByte;
}
Another cypherpunk, this one not anonymous, tested the
output from this algorithm against the output from
official RC4 object code
Date: Tue, 13 Sep 94 18:37:56 PDT
From: ekr@eit.COM (Eric Rescorla)
Message-Id: <9409140137.AA17743@eitech.eit.com>
Subject: RC4 compatibility testing
Cc: cypherpunks@toad.com
One data point:
I can't say anything about the internals of RC4 versus the
algorithm that Bill Sommerfeld is rightly calling 'Alleged RC4',
since I don't know anything about RC4's internals.
However, I do have a (legitimately acquired) copy of BSAFE2 and
so I'm able to compare the output of this algorithm to the output
of genuine RC4 as found in BSAFE. I chose a set of test vectors
and ran them through both algorithms. The algorithms appear to
give identical results, at least with these key/plaintext pairs.
I note that this is the algorithm _without_ Hal Finney's
proposed modification
(see <199409130605.XAA24133@jobe.shell.portal.com>).
The vectors I used (together with the ciphertext they produce)
follow at the end of this message.
-Ekr
Disclaimer: This posting does not reflect the opinions of EIT.
--------------------results follow--------------
Test vector 0
Key: 0x01 0x23 0x45 0x67 0x89 0xab 0xcd 0xef
Input: 0x01 0x23 0x45 0x67 0x89 0xab 0xcd 0xef
0 Output: 0x75 0xb7 0x87 0x80 0x99 0xe0 0xc5 0x96
Test vector 1
Key: 0x01 0x23 0x45 0x67 0x89 0xab 0xcd 0xef
Input: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
0 Output: 0x74 0x94 0xc2 0xe7 0x10 0x4b 0x08 0x79
Test vector 2
Key: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
Input: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
0 Output: 0xde 0x18 0x89 0x41 0xa3 0x37 0x5d 0x3a
Test vector 3
Key: 0xef 0x01 0x23 0x45
Input: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
0 Output: 0xd6 0xa1 0x41 0xa7 0xec 0x3c 0x38 0xdf 0xbd 0x61
Test vector 4
Key: 0x01 0x23 0x45 0x67 0x89 0xab 0xcd 0xef
Input: 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
0x01
0 Output: 0x75 0x95 0xc3 0xe6 0x11 0x4a 0x09 0x78 0x0c 0x4a 0xd4
0x52 0x33 0x8e 0x1f 0xfd 0x9a 0x1b 0xe9 0x49 0x8f
0x81 0x3d 0x76 0x53 0x34 0x49 0xb6 0x77 0x8d 0xca
0xd8 0xc7 0x8a 0x8d 0x2b 0xa9 0xac 0x66 0x08 0x5d
0x0e 0x53 0xd5 0x9c 0x26 0xc2 0xd1 0xc4 0x90 0xc1
0xeb 0xbe 0x0c 0xe6 0x6d 0x1b 0x6b 0x1b 0x13 0xb6
0xb9 0x19 0xb8 0x47 0xc2 0x5a 0x91 0x44 0x7a 0x95
0xe7 0x5e 0x4e 0xf1 0x67 0x79 0xcd 0xe8 0xbf 0x0a
0x95 0x85 0x0e 0x32 0xaf 0x96 0x89 0x44 0x4f 0xd3
0x77 0x10 0x8f 0x98 0xfd 0xcb 0xd4 0xe7 0x26 0x56
0x75 0x00 0x99 0x0b 0xcc 0x7e 0x0c 0xa3 0xc4 0xaa
0xa3 0x04 0xa3 0x87 0xd2 0x0f 0x3b 0x8f 0xbb 0xcd
0x42 0xa1 0xbd 0x31 0x1d 0x7a 0x43 0x03 0xdd 0xa5
0xab 0x07 0x88 0x96 0xae 0x80 0xc1 0x8b 0x0a 0xf6
0x6d 0xff 0x31 0x96 0x16 0xeb 0x78 0x4e 0x49 0x5a
0xd2 0xce 0x90 0xd7 0xf7 0x72 0xa8 0x17 0x47 0xb6
0x5f 0x62 0x09 0x3b 0x1e 0x0d 0xb9 0xe5 0xba 0x53
0x2f 0xaf 0xec 0x47 0x50 0x83 0x23 0xe6 0x71 0x32
0x7d 0xf9 0x44 0x44 0x32 0xcb 0x73 0x67 0xce 0xc8
0x2f 0x5d 0x44 0xc0 0xd0 0x0b 0x67 0xd6 0x50 0xa0
0x75 0xcd 0x4b 0x70 0xde 0xdd 0x77 0xeb 0x9b 0x10
0x23 0x1b 0x6b 0x5b 0x74 0x13 0x47 0x39 0x6d 0x62
0x89 0x74 0x21 0xd4 0x3d 0xf9 0xb4 0x2e 0x44 0x6e
0x35 0x8e 0x9c 0x11 0xa9 0xb2 0x18 0x4e 0xcb 0xef
0x0c 0xd8 0xe7 0xa8 0x77 0xef 0x96 0x8f 0x13 0x90
0xec 0x9b 0x3d 0x35 0xa5 0x58 0x5c 0xb0 0x09 0x29
0x0e 0x2f 0xcd 0xe7 0xb5 0xec 0x66 0xd9 0x08 0x4b
0xe4 0x40 0x55 0xa6 0x19 0xd9 0xdd 0x7f 0xc3 0x16
0x6f 0x94 0x87 0xf7 0xcb 0x27 0x29 0x12 0x42 0x64
0x45 0x99 0x85 0x14 0xc1 0x5d 0x53 0xa1 0x8c 0x86
0x4c 0xe3 0xa2 0xb7 0x55 0x57 0x93 0x98 0x81 0x26
0x52 0x0e 0xac 0xf2 0xe3 0x06 0x6e 0x23 0x0c 0x91
0xbe 0xe4 0xdd 0x53 0x04 0xf5 0xfd 0x04 0x05 0xb3
0x5b 0xd9 0x9c 0x73 0x13 0x5d 0x3d 0x9b 0xc3 0x35
0xee 0x04 0x9e 0xf6 0x9b 0x38 0x67 0xbf 0x2d 0x7b
0xd1 0xea 0xa5 0x95 0xd8 0xbf 0xc0 0x06 0x6f 0xf8
0xd3 0x15 0x09 0xeb 0x0c 0x6c 0xaa 0x00 0x6c 0x80
0x7a 0x62 0x3e 0xf8 0x4c 0x3d 0x33 0xc1 0x95 0xd2
0x3e 0xe3 0x20 0xc4 0x0d 0xe0 0x55 0x81 0x57 0xc8
0x22 0xd4 0xb8 0xc5 0x69 0xd8 0x49 0xae 0xd5 0x9d
0x4e 0x0f 0xd7 0xf3 0x79 0x58 0x6b 0x4b 0x7f 0xf6
0x84 0xed 0x6a 0x18 0x9f 0x74 0x86 0xd4 0x9b 0x9c
0x4b 0xad 0x9b 0xa2 0x4b 0x96 0xab 0xf9 0x24 0x37
0x2c 0x8a 0x8f 0xff 0xb1 0x0d 0x55 0x35 0x49 0x00
0xa7 0x7a 0x3d 0xb5 0xf2 0x05 0xe1 0xb9 0x9f 0xcd
0x86 0x60 0x86 0x3a 0x15 0x9a 0xd4 0xab 0xe4 0x0f
0xa4 0x89 0x34 0x16 0x3d 0xdd 0xe5 0x42 0xa6 0x58
0x55 0x40 0xfd 0x68 0x3c 0xbf 0xd8 0xc0 0x0f 0x12
0x12 0x9a 0x28 0x4d 0xea 0xcc 0x4c 0xde 0xfe 0x58
0xbe 0x71 0x37 0x54 0x1c 0x04 0x71 0x26 0xc8 0xd4
0x9e 0x27 0x55 0xab 0x18 0x1a 0xb7 0xe9 0x40 0xb0
0xc0
--
---------------------------------------------------------------------
We have the right to defend ourselves and our
property, because of the kind of animals that we James A. Donald
are. True law derives from this right, not from
the arbitrary power of the omnipotent state. jamesd@netcom.com