Skip to content

Commit 7c6ef2f

Browse files
Nicolas Pitre authored and Junio C Hamano committed
[PATCH] ARM optimized SHA1 implementation
This is my ARM assembly SHA1 implementation for GIT. It is approximately 50% faster than the generic C version. On an XScale processor running at 400MHz: generic C version: 9.8 MB/s; my version: 14.5 MB/s. It's not that I expect a lot of big GIT users on ARM, but I still know about one important ARM user that might benefit from it, and writing that code was fun. I also reworked the makefile a bit so that any optimized SHA1 implementation is used regardless of whether NO_OPENSSL is defined or not. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Junio C Hamano <junkio@cox.net>
1 parent f1f0d08 commit 7c6ef2f

File tree

4 files changed

+308
-12
lines changed

4 files changed

+308
-12
lines changed

Makefile

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
# Define PPC_SHA1 environment variable when running make to make use of
1515
# a bundled SHA1 routine optimized for PowerPC.
1616
#
17+
# Define ARM_SHA1 environment variable when running make to make use of
18+
# a bundled SHA1 routine optimized for ARM.
19+
#
1720
# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
1821
#
1922
# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
@@ -162,6 +165,9 @@ ifeq ($(shell uname -s),SunOS)
162165
NEEDS_NSL = YesPlease
163166
PLATFORM_DEFINES += -D__EXTENSIONS__
164167
endif
168+
# Select the bundled ARM assembly SHA1 automatically on 32-bit ARM hosts,
# i.e. when `uname -m` reports a name starting with "arm" (arm, armv5tel, ...).
# Names starting with "arm64" are excluded: the bundled routine is 32-bit
# ARM assembly and cannot be assembled for a 64-bit target.
ifneq (,$(filter arm%,$(filter-out arm64%,$(shell uname -m))))
  ARM_SHA1 = YesPlease
endif
165171

166172
ifndef SHELL_PATH
167173
SHELL_PATH = /bin/sh
@@ -191,18 +197,6 @@ ifdef NEEDS_LIBICONV
191197
else
192198
LIB_4_ICONV =
193199
endif
194-
ifdef MOZILLA_SHA1
195-
SHA1_HEADER = "mozilla-sha1/sha1.h"
196-
LIB_OBJS += mozilla-sha1/sha1.o
197-
else
198-
ifdef PPC_SHA1
199-
SHA1_HEADER = "ppc/sha1.h"
200-
LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
201-
else
202-
SHA1_HEADER = <openssl/sha.h>
203-
LIBS += $(LIB_4_CRYPTO)
204-
endif
205-
endif
206200
ifdef NEEDS_SOCKET
207201
LIBS += -lsocket
208202
SIMPLE_LIB += -lsocket
@@ -216,6 +210,24 @@ ifdef NO_STRCASESTR
216210
LIB_OBJS += compat/strcasestr.o
217211
endif
218212

213+
# Choose the SHA1 backend.  Precedence: PPC_SHA1, then ARM_SHA1, then
# MOZILLA_SHA1; when none of them is defined the OpenSSL header is used
# and $(LIB_4_CRYPTO) is added to LIBS.
ifndef PPC_SHA1
  ifndef ARM_SHA1
    ifndef MOZILLA_SHA1
      SHA1_HEADER = <openssl/sha.h>
      LIBS += $(LIB_4_CRYPTO)
    else
      SHA1_HEADER = "mozilla-sha1/sha1.h"
      LIB_OBJS += mozilla-sha1/sha1.o
    endif
  else
    SHA1_HEADER = "arm/sha1.h"
    LIB_OBJS += arm/sha1.o arm/sha1_arm.o
  endif
else
  SHA1_HEADER = "ppc/sha1.h"
  LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
endif
230+
219231
DEFINES += '-DSHA1_HEADER=$(SHA1_HEADER)'
220232

221233
SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \

arm/sha1.c

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* SHA-1 implementation optimized for ARM
3+
*
4+
* Copyright: (C) 2005 by Nicolas Pitre <nico@cam.org>
5+
* Created: September 17, 2005
6+
*/
7+
8+
#include <string.h>
9+
#include "sha1.h"
10+
11+
extern void sha_transform(uint32_t *hash, const unsigned char *data, uint32_t *W);
12+
13+
/*
 * Reset a context: zero the byte counter and load the five SHA-1
 * initial chaining words into c->hash.
 */
void SHA1_Init(SHA_CTX *c)
{
	static const uint32_t initial_state[5] = {
		0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
	};
	unsigned int word;

	for (word = 0; word < 5; word++)
		c->hash[word] = initial_state[word];
	c->len = 0;
}
22+
23+
/*
 * Feed n bytes starting at p into the hash state held in c.
 *
 * Bytes that do not yet make up a full 64-byte block are kept in
 * c->buffer; every complete block is handed to sha_transform().
 * c->len counts the total number of bytes seen so far.
 */
void SHA1_Update(SHA_CTX *c, const void *p, unsigned long n)
{
	/* byte pointer: arithmetic on "void *" is not valid ISO C */
	const unsigned char *data = p;
	uint32_t workspace[80];
	unsigned int partial;
	unsigned long done;

	/* number of bytes already waiting in c->buffer (0..63) */
	partial = c->len & 0x3f;
	c->len += n;

	/* same test as "partial + n >= 64", written so it cannot wrap */
	if (n >= 64 - partial) {
		if (partial) {
			/* top up the pending block and process it */
			done = 64 - partial;
			memcpy(c->buffer + partial, data, done);
			sha_transform(c->hash, c->buffer, workspace);
			partial = 0;
		} else
			done = 0;
		/* process the remaining full blocks straight from the input */
		while (n - done >= 64) {
			sha_transform(c->hash, data + done, workspace);
			done += 64;
		}
	} else
		done = 0;

	/* stash the tail for the next call */
	if (n - done)
		memcpy(c->buffer + partial, data + done, n - done);
}
48+
49+
/*
 * Finish the computation: append the 0x80 marker and zero padding up to
 * 56 bytes modulo 64, append the message length in bits as a 64-bit
 * big-endian number, then write the five state words to "hash" in
 * big-endian byte order (20 bytes).
 */
void SHA1_Final(unsigned char *hash, SHA_CTX *c)
{
	static const unsigned char padding[64] = { 0x80, };
	unsigned char bits[8];
	uint64_t bitlen = c->len << 3;
	unsigned int offset = c->len & 0x3f;
	unsigned int padlen, i;

	/* pad to the next position that leaves 8 bytes free in a block */
	if (offset < 56)
		padlen = 56 - offset;
	else
		padlen = (64 + 56) - offset;
	SHA1_Update(c, padding, padlen);

	/* bit count, most significant byte first */
	for (i = 0; i < 8; i++)
		bits[i] = (unsigned char)(bitlen >> (56 - 8 * i));
	SHA1_Update(c, bits, 8);

	/* serialise the state words, most significant byte first */
	for (i = 0; i < 5; i++) {
		uint32_t v = c->hash[i];
		unsigned char *out = hash + 4 * i;

		out[0] = v >> 24;
		out[1] = v >> 16;
		out[2] = v >> 8;
		out[3] = v;
	}
}

arm/sha1.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
/*
2+
* SHA-1 implementation optimized for ARM
3+
*
4+
* Copyright: (C) 2005 by Nicolas Pitre <nico@cam.org>
5+
* Created: September 17, 2005
6+
*/
7+
8+
#ifndef ARM_SHA1_H
#define ARM_SHA1_H

#include <stdint.h>

/* State of one SHA-1 computation in progress. */
typedef struct sha_context {
	uint64_t len;			/* total number of bytes fed in so far */
	uint32_t hash[5];		/* current chaining value (five 32-bit words) */
	unsigned char buffer[64];	/* input bytes not yet forming a full block */
} SHA_CTX;

/* Reset c to the SHA-1 initial state. */
void SHA1_Init(SHA_CTX *c);

/* Feed n bytes starting at p into the computation. */
void SHA1_Update(SHA_CTX *c, const void *p, unsigned long n);

/* Apply the final padding and store the 20-byte digest at hash. */
void SHA1_Final(unsigned char *hash, SHA_CTX *c);

#endif /* ARM_SHA1_H */

arm/sha1_arm.S

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
/*
2+
* SHA transform optimized for ARM
3+
*
4+
* Copyright: (C) 2005 by Nicolas Pitre <nico@cam.org>
5+
* Created: September 17, 2005
6+
*
7+
* This program is free software; you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License version 2 as
9+
* published by the Free Software Foundation.
10+
*/
11+
12+
.text
13+
.globl sha_transform
14+
15+
/*
 * void sha_transform(uint32_t *hash, const unsigned char *data, uint32_t *W);
 *
 * Run the SHA-1 compression function over one 64-byte block.
 *
 * In:    r0 = hash  five 32-bit state words, updated in place
 *        r1 = data  the 64 input bytes
 *        r2 = W     scratch area, 80 32-bit words are written to it
 * Saved: r4 - r8 and lr are pushed on entry and restored on return
 * Uses:  r1 - r3, ip and lr as scratch
 * Calls: memcpy, on big-endian (__ARMEB__) builds only
 *
 * note: the "data" pointer may be unaligned.
 */

#ifdef __ELF__
	.type	sha_transform, %function
#endif

sha_transform:

	stmfd	sp!, {r4 - r8, lr}

	@ for (i = 0; i < 16; i++)
	@	W[i] = ntohl(((uint32_t *)data)[i]);

#ifdef __ARMEB__
	/* big-endian: the input bytes already have the wanted order, just copy */
	mov	r4, r0			@ park the hash pointer in r4 over the call
	mov	r0, r2			@ memcpy(W, data, 64)
	mov	r2, #64
	bl	memcpy
	mov	r2, r0			@ r2 = W again (memcpy returns its dest)
	mov	r0, r4			@ r0 = hash again
#else
	/* little-endian: build each word from four single-byte loads */
	mov	r3, r2			@ r3 = store cursor into W
	mov	lr, #16			@ 16 words to build
1:	ldrb	r4, [r1], #1		@ byte loads work at any alignment
	ldrb	r5, [r1], #1
	ldrb	r6, [r1], #1
	ldrb	r7, [r1], #1
	subs	lr, lr, #1		@ flags stay live until the bne below
	orr	r5, r5, r4, lsl #8
	orr	r6, r6, r5, lsl #8
	orr	r7, r7, r6, lsl #8	@ r7 = b0<<24 | b1<<16 | b2<<8 | b3
	str	r7, [r3], #4
	bne	1b
#endif

	@ for (i = 0; i < 64; i++)
	@	W[i+16] = ror(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 31);

	sub	r3, r2, #4		@ the writeback load below makes r3 = &W[i]
	mov	lr, #64
2:	ldr	r4, [r3, #4]!		@ r4 = W[i]
	subs	lr, lr, #1
	ldr	r5, [r3, #8]		@ r5 = W[i+2]
	ldr	r6, [r3, #32]		@ r6 = W[i+8]
	ldr	r7, [r3, #52]		@ r7 = W[i+13]
	eor	r4, r4, r5
	eor	r4, r4, r6
	eor	r4, r4, r7
	mov	r4, r4, ror #31		@ i.e. rotate left by one bit
	str	r4, [r3, #64]		@ W[i+16]
	bne	2b

/*
 * The SHA functions are:
 *
 * f1(B,C,D) = (D ^ (B & (C ^ D)))
 * f2(B,C,D) = (B ^ C ^ D)
 * f3(B,C,D) = ((B & C) | (D & (B | C)))
 *
 * Then the sub-blocks are processed as follows:
 *
 * A' = ror(A, 27) + f(B,C,D) + E + K + *W++
 * B' = A
 * C' = ror(B, 2)
 * D' = C
 * E' = D
 *
 * We therefore unroll each loop 5 times to avoid register shuffling.
 * Also the ror for C (and also D and E which are successively derived
 * from it) is applied in place to cut on an additional mov insn for
 * each round.
 *
 * Register roles in the round macros:
 *   r1 = K for the current group of 20 rounds
 *   r2 = pointer to the next W word (post-incremented by each round)
 *   r3 = the W word just loaded (sha_f3 reuses it as a temporary)
 *   ip = value of f(B,C,D)
 * The registers passed as C, D and E hold values that must be read
 * through "ror #2"; the register passed as E receives the new A.
 * None of the macros modifies the condition flags.
 */

	.macro	sha_f1, A, B, C, D, E
	ldr	r3, [r2], #4
	eor	ip, \C, \D
	add	\E, r1, \E, ror #2
	and	ip, \B, ip, ror #2
	add	\E, \E, \A, ror #27
	eor	ip, ip, \D, ror #2
	add	\E, \E, r3
	add	\E, \E, ip
	.endm

	.macro	sha_f2, A, B, C, D, E
	ldr	r3, [r2], #4
	add	\E, r1, \E, ror #2
	eor	ip, \B, \C, ror #2
	add	\E, \E, \A, ror #27
	eor	ip, ip, \D, ror #2
	add	\E, \E, r3
	add	\E, \E, ip
	.endm

	.macro	sha_f3, A, B, C, D, E
	ldr	r3, [r2], #4
	add	\E, r1, \E, ror #2
	orr	ip, \B, \C, ror #2
	add	\E, \E, \A, ror #27
	and	ip, ip, \D, ror #2
	add	\E, \E, r3
	and	r3, \B, \C, ror #2
	orr	ip, ip, r3
	add	\E, \E, ip
	.endm

	ldmia	r0, {r4 - r8}		@ r4..r8 = A, B, C, D, E

	mov	lr, #4			@ 4 passes of 5 rounds = 20 rounds
	ldr	r1, .L_sha_K + 0

	/* adjust initial values */
	mov	r6, r6, ror #30		@ store C, D and E rotated left by two so
	mov	r7, r7, ror #30		@ that the "ror #2" reads in the macros
	mov	r8, r8, ror #30		@ see the original values

3:	subs	lr, lr, #1		@ flags survive the macros until bne
	sha_f1	r4, r5, r6, r7, r8
	sha_f1	r8, r4, r5, r6, r7
	sha_f1	r7, r8, r4, r5, r6
	sha_f1	r6, r7, r8, r4, r5
	sha_f1	r5, r6, r7, r8, r4
	bne	3b

	ldr	r1, .L_sha_K + 4
	mov	lr, #4

4:	subs	lr, lr, #1
	sha_f2	r4, r5, r6, r7, r8
	sha_f2	r8, r4, r5, r6, r7
	sha_f2	r7, r8, r4, r5, r6
	sha_f2	r6, r7, r8, r4, r5
	sha_f2	r5, r6, r7, r8, r4
	bne	4b

	ldr	r1, .L_sha_K + 8
	mov	lr, #4

5:	subs	lr, lr, #1
	sha_f3	r4, r5, r6, r7, r8
	sha_f3	r8, r4, r5, r6, r7
	sha_f3	r7, r8, r4, r5, r6
	sha_f3	r6, r7, r8, r4, r5
	sha_f3	r5, r6, r7, r8, r4
	bne	5b

	ldr	r1, .L_sha_K + 12
	mov	lr, #4

6:	subs	lr, lr, #1
	sha_f2	r4, r5, r6, r7, r8
	sha_f2	r8, r4, r5, r6, r7
	sha_f2	r7, r8, r4, r5, r6
	sha_f2	r6, r7, r8, r4, r5
	sha_f2	r5, r6, r7, r8, r4
	bne	6b

	/* add the previous state; C, D and E are un-rotated on the way */
	ldmia	r0, {r1, r2, r3, ip, lr}
	add	r4, r1, r4
	add	r5, r2, r5
	add	r6, r3, r6, ror #2
	add	r7, ip, r7, ror #2
	add	r8, lr, r8, ror #2
	stmia	r0, {r4 - r8}

	ldmfd	sp!, {r4 - r8, pc}	@ restore and return

	/* round constants, one per group of 20 rounds */
.L_sha_K:
	.word	0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6

#ifdef __ELF__
	.size	sha_transform, . - sha_transform
#endif
184+

0 commit comments

Comments
 (0)