Skip to content

Commit a50d201

Browse files
author
Fredrik Lundh
committed
needforspeed: stringlib refactoring (in progress)
1 parent 877ab9b commit a50d201

4 files changed

Lines changed: 111 additions & 179 deletions

File tree

Objects/stringlib/README.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
bits shared by the stringobject and unicodeobject implementations (and
2+
possibly other modules, in a not too distant future).
3+
4+
the stuff in here is included into relevant places; see the individual
5+
source files for details.

Objects/stringlib/fastsearch.h

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/* stringlib: fastsearch implementation */
2+
3+
#ifndef STRINGLIB_FASTSEARCH_H
4+
#define STRINGLIB_FASTSEARCH_H
5+
6+
/* fast search/count implementation, based on a mix between boyer-
7+
moore and horspool, with a few more bells and whistles on the top.
8+
for some more background, see: http://effbot.org/stringlib */
9+
10+
/* note: fastsearch may access s[n], which isn't a problem when using
11+
Python's ordinary string types, but may cause problems if you're
12+
using this code in other contexts. also, the count mode returns -1
13+
if there cannot possibly be a match in the target string, and 0 if
14+
it has actually checked for matches, but didn't find any. callers
15+
beware! */
16+
17+
#define FAST_COUNT 0
18+
#define FAST_SEARCH 1
19+
20+
Py_LOCAL(Py_ssize_t)
21+
fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
22+
const STRINGLIB_CHAR* p, Py_ssize_t m,
23+
int mode)
24+
{
25+
long mask;
26+
Py_ssize_t skip, count = 0;
27+
Py_ssize_t i, j, mlast, w;
28+
29+
w = n - m;
30+
31+
if (w < 0)
32+
return -1;
33+
34+
/* look for special cases */
35+
if (m <= 1) {
36+
if (m <= 0)
37+
return -1;
38+
/* use special case for 1-character strings */
39+
if (mode == FAST_COUNT) {
40+
for (i = 0; i < n; i++)
41+
if (s[i] == p[0])
42+
count++;
43+
return count;
44+
} else {
45+
for (i = 0; i < n; i++)
46+
if (s[i] == p[0])
47+
return i;
48+
}
49+
return -1;
50+
}
51+
52+
mlast = m - 1;
53+
54+
/* create compressed boyer-moore delta 1 table */
55+
skip = mlast - 1;
56+
/* process pattern[:-1] */
57+
for (mask = i = 0; i < mlast; i++) {
58+
mask |= (1 << (p[i] & 0x1F));
59+
if (p[i] == p[mlast])
60+
skip = mlast - i - 1;
61+
}
62+
/* process pattern[-1] outside the loop */
63+
mask |= (1 << (p[mlast] & 0x1F));
64+
65+
for (i = 0; i <= w; i++) {
66+
/* note: using mlast in the skip path slows things down on x86 */
67+
if (s[i+m-1] == p[m-1]) {
68+
/* candidate match */
69+
for (j = 0; j < mlast; j++)
70+
if (s[i+j] != p[j])
71+
break;
72+
if (j == mlast) {
73+
/* got a match! */
74+
if (mode != FAST_COUNT)
75+
return i;
76+
count++;
77+
i = i + mlast;
78+
continue;
79+
}
80+
/* miss: check if next character is part of pattern */
81+
if (!(mask & (1 << (s[i+m] & 0x1F))))
82+
i = i + m;
83+
else
84+
i = i + skip;
85+
} else {
86+
/* skip: check if next character is part of pattern */
87+
if (!(mask & (1 << (s[i+m] & 0x1F))))
88+
i = i + m;
89+
}
90+
}
91+
92+
if (mode != FAST_COUNT)
93+
return -1;
94+
return count;
95+
}
96+
97+
#endif

Objects/stringobject.c

Lines changed: 7 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -765,102 +765,17 @@ PyString_AsStringAndSize(register PyObject *obj,
765765
}
766766

767767
/* -------------------------------------------------------------------- */
768-
/* Helpers */
768+
/* stringlib components */
769769

770-
#define USE_FAST /* experimental fast search implementation */
770+
#define USE_FAST
771771

772-
/* XXX - this code is copied from unicodeobject.c. we really should
773-
refactor the core implementations (see _sre.c for how this can be
774-
done), but that'll have to wait -- fredrik */
775-
776-
/* fast search/count implementation, based on a mix between boyer-
777-
moore and horspool, with a few more bells and whistles on the top.
778-
for some more background, see: http://effbot.org/stringlib */
779-
780-
/* note: fastsearch may access s[n], which isn't a problem when using
781-
Python's ordinary string types, but may cause problems if you're
782-
using this code in other contexts. also, the count mode returns -1
783-
if there cannot possibly be a match in the target string, and 0 if
784-
it has actually checked for matches, but didn't find any. callers
785-
beware! */
786-
787-
#define FAST_COUNT 0
788-
#define FAST_SEARCH 1
789-
790-
Py_LOCAL(Py_ssize_t)
791-
fastsearch(const char* s, Py_ssize_t n, const char* p, Py_ssize_t m, int mode)
792-
{
793-
long mask;
794-
Py_ssize_t skip, count = 0;
795-
Py_ssize_t i, j, mlast, w;
796-
797-
w = n - m;
798-
799-
if (w < 0)
800-
return -1;
772+
#ifdef USE_FAST
801773

802-
/* look for special cases */
803-
if (m <= 1) {
804-
if (m <= 0)
805-
return -1;
806-
/* use special case for 1-character strings */
807-
if (mode == FAST_COUNT) {
808-
for (i = 0; i < n; i++)
809-
if (s[i] == p[0])
810-
count++;
811-
return count;
812-
} else {
813-
for (i = 0; i < n; i++)
814-
if (s[i] == p[0])
815-
return i;
816-
}
817-
return -1;
818-
}
774+
#define STRINGLIB_CHAR char
819775

820-
mlast = m - 1;
821-
822-
/* create compressed boyer-moore delta 1 table */
823-
skip = mlast - 1;
824-
/* process pattern[:-1] */
825-
for (mask = i = 0; i < mlast; i++) {
826-
mask |= (1 << (p[i] & 0x1F));
827-
if (p[i] == p[mlast])
828-
skip = mlast - i - 1;
829-
}
830-
/* process pattern[-1] outside the loop */
831-
mask |= (1 << (p[mlast] & 0x1F));
832-
833-
for (i = 0; i <= w; i++) {
834-
/* note: using mlast in the skip path slows things down on x86 */
835-
if (s[i+m-1] == p[m-1]) {
836-
/* candidate match */
837-
for (j = 0; j < mlast; j++)
838-
if (s[i+j] != p[j])
839-
break;
840-
if (j == mlast) {
841-
/* got a match! */
842-
if (mode != FAST_COUNT)
843-
return i;
844-
count++;
845-
i = i + mlast;
846-
continue;
847-
}
848-
/* miss: check if next character is part of pattern */
849-
if (!(mask & (1 << (s[i+m] & 0x1F))))
850-
i = i + m;
851-
else
852-
i = i + skip;
853-
} else {
854-
/* skip: check if next character is part of pattern */
855-
if (!(mask & (1 << (s[i+m] & 0x1F))))
856-
i = i + m;
857-
}
858-
}
776+
#include "stringlib/fastsearch.h"
859777

860-
if (mode != FAST_COUNT)
861-
return -1;
862-
return count;
863-
}
778+
#endif
864779

865780
/* -------------------------------------------------------------------- */
866781
/* Methods */
@@ -2416,7 +2331,7 @@ string_count(PyStringObject *self, PyObject *args)
24162331
#else
24172332
r = 0;
24182333
while (i < m) {
2419-
const char *t
2334+
const char *t;
24202335
if (!memcmp(s+i, sub, n)) {
24212336
r++;
24222337
i += n;

Objects/unicodeobject.c

Lines changed: 2 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -3854,94 +3854,9 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
38543854

38553855
/* --- Helpers ------------------------------------------------------------ */
38563856

3857-
/* fast search/count implementation, based on a mix between boyer-
3858-
moore and horspool, with a few more bells and whistles on the top.
3859-
for some more background, see: http://effbot.org/stringlib */
3857+
#define STRINGLIB_CHAR Py_UNICODE
38603858

3861-
/* note: fastsearch may access s[n], which isn't a problem when using
3862-
Python's ordinary string types, but may cause problems if you're
3863-
using this code in other contexts. also, the count mode returns -1
3864-
if there cannot possible be a match in the target string, and 0 if
3865-
it has actually checked for matches, but didn't find any. callers
3866-
beware! */
3867-
3868-
#define FAST_COUNT 0
3869-
#define FAST_SEARCH 1
3870-
3871-
Py_LOCAL(Py_ssize_t)
3872-
fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode)
3873-
{
3874-
long mask;
3875-
Py_ssize_t skip, count = 0;
3876-
Py_ssize_t i, j, mlast, w;
3877-
3878-
w = n - m;
3879-
3880-
if (w < 0)
3881-
return -1;
3882-
3883-
/* look for special cases */
3884-
if (m <= 1) {
3885-
if (m <= 0)
3886-
return -1;
3887-
/* use special case for 1-character strings */
3888-
if (mode == FAST_COUNT) {
3889-
for (i = 0; i < n; i++)
3890-
if (s[i] == p[0])
3891-
count++;
3892-
return count;
3893-
} else {
3894-
for (i = 0; i < n; i++)
3895-
if (s[i] == p[0])
3896-
return i;
3897-
}
3898-
return -1;
3899-
}
3900-
3901-
mlast = m - 1;
3902-
3903-
/* create compressed boyer-moore delta 1 table */
3904-
skip = mlast - 1;
3905-
/* process pattern[:-1] */
3906-
for (mask = i = 0; i < mlast; i++) {
3907-
mask |= (1 << (p[i] & 0x1F));
3908-
if (p[i] == p[mlast])
3909-
skip = mlast - i - 1;
3910-
}
3911-
/* process pattern[-1] outside the loop */
3912-
mask |= (1 << (p[mlast] & 0x1F));
3913-
3914-
for (i = 0; i <= w; i++) {
3915-
/* note: using mlast in the skip path slows things down on x86 */
3916-
if (s[i+m-1] == p[m-1]) {
3917-
/* candidate match */
3918-
for (j = 0; j < mlast; j++)
3919-
if (s[i+j] != p[j])
3920-
break;
3921-
if (j == mlast) {
3922-
/* got a match! */
3923-
if (mode != FAST_COUNT)
3924-
return i;
3925-
count++;
3926-
i = i + mlast;
3927-
continue;
3928-
}
3929-
/* miss: check if next character is part of pattern */
3930-
if (!(mask & (1 << (s[i+m] & 0x1F))))
3931-
i = i + m;
3932-
else
3933-
i = i + skip;
3934-
} else {
3935-
/* skip: check if next character is part of pattern */
3936-
if (!(mask & (1 << (s[i+m] & 0x1F))))
3937-
i = i + m;
3938-
}
3939-
}
3940-
3941-
if (mode != FAST_COUNT)
3942-
return -1;
3943-
return count;
3944-
}
3859+
#include "stringlib/fastsearch.h"
39453860

39463861
Py_LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
39473862
Py_ssize_t start,

0 commit comments

Comments
 (0)