Skip to content

Commit 87b2968

Browse files
committed
downcase_identifier(): use method table from locale provider.
Previously, libc's tolower() was always used for lowercasing identifiers, regardless of the database locale (though only characters beyond 127 in single-byte encodings were affected). Refactor to allow each provider to supply its own implementation of identifier downcasing. For historical compatibility, when using a single-byte encoding, ICU still relies on tolower(). One minor behavior change is that, before the database default locale is initialized, it uses ASCII semantics to downcase the identifiers. Previously, it would use the postmaster's LC_CTYPE setting from the environment. While that could have some effect during GUC processing, for example, it would have been fragile to rely on the environment setting anyway. (Also, it only matters when the encoding is single-byte.) Reviewed-by: Chao Li <li.evan.chao@gmail.com> Reviewed-by: Peter Eisentraut <peter@eisentraut.org> Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com
1 parent 7f007e4 commit 87b2968

File tree

6 files changed

+107
-25
lines changed

6 files changed

+107
-25
lines changed

src/backend/parser/scansup.c

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include "mb/pg_wchar.h"
2020
#include "parser/scansup.h"
21+
#include "utils/pg_locale.h"
2122

2223

2324
/*
@@ -46,35 +47,22 @@ char *
4647
downcase_identifier(const char *ident, int len, bool warn, bool truncate)
4748
{
4849
char *result;
49-
int i;
50-
bool enc_is_single_byte;
51-
52-
result = palloc(len + 1);
53-
enc_is_single_byte = pg_database_encoding_max_length() == 1;
50+
size_t needed pg_attribute_unused();
5451

5552
/*
56-
* SQL99 specifies Unicode-aware case normalization, which we don't yet
57-
* have the infrastructure for. Instead we use tolower() to provide a
58-
* locale-aware translation. However, there are some locales where this
59-
* is not right either (eg, Turkish may do strange things with 'i' and
60-
* 'I'). Our current compromise is to use tolower() for characters with
61-
* the high bit set, as long as they aren't part of a multi-byte
62-
* character, and use an ASCII-only downcasing for 7-bit characters.
53+
* Preserves string length.
54+
*
55+
* NB: if we decide to support Unicode-aware identifier case folding, then
56+
* we need to account for a change in string length.
6357
*/
64-
for (i = 0; i < len; i++)
65-
{
66-
unsigned char ch = (unsigned char) ident[i];
58+
result = palloc(len + 1);
6759

68-
if (ch >= 'A' && ch <= 'Z')
69-
ch += 'a' - 'A';
70-
else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
71-
ch = tolower(ch);
72-
result[i] = (char) ch;
73-
}
74-
result[i] = '\0';
60+
needed = pg_downcase_ident(result, len + 1, ident, len);
61+
Assert(needed == len);
62+
Assert(result[len] == '\0');
7563

76-
if (i >= NAMEDATALEN && truncate)
77-
truncate_identifier(result, i, warn);
64+
if (len >= NAMEDATALEN && truncate)
65+
truncate_identifier(result, len, warn);
7866

7967
return result;
8068
}

src/backend/utils/adt/pg_locale.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1352,6 +1352,26 @@ pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
13521352
return locale->ctype->strfold(dst, dstsize, src, srclen, locale);
13531353
}
13541354

1355+
/*
1356+
* Lowercase an identifier using the database default locale.
1357+
*
1358+
* For historical reasons, does not use ordinary locale behavior. Should only
1359+
* be used for identifiers. XXX: can we make this equivalent to
1360+
* pg_strfold(..., default_locale)?
1361+
*/
1362+
size_t
1363+
pg_downcase_ident(char *dst, size_t dstsize, const char *src, ssize_t srclen)
1364+
{
1365+
pg_locale_t locale = default_locale;
1366+
1367+
if (locale == NULL || locale->ctype == NULL ||
1368+
locale->ctype->downcase_ident == NULL)
1369+
return strlower_c(dst, dstsize, src, srclen);
1370+
else
1371+
return locale->ctype->downcase_ident(dst, dstsize, src, srclen,
1372+
locale);
1373+
}
1374+
13551375
/*
13561376
* pg_strcoll
13571377
*

src/backend/utils/adt/pg_locale_builtin.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ static const struct ctype_methods ctype_methods_builtin = {
208208
.strtitle = strtitle_builtin,
209209
.strupper = strupper_builtin,
210210
.strfold = strfold_builtin,
211+
/* uses plain ASCII semantics for historical reasons */
212+
.downcase_ident = NULL,
211213
.wc_isdigit = wc_isdigit_builtin,
212214
.wc_isalpha = wc_isalpha_builtin,
213215
.wc_isalnum = wc_isalnum_builtin,

src/backend/utils/adt/pg_locale_icu.c

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ static size_t strupper_icu(char *dest, size_t destsize, const char *src,
6161
ssize_t srclen, pg_locale_t locale);
6262
static size_t strfold_icu(char *dest, size_t destsize, const char *src,
6363
ssize_t srclen, pg_locale_t locale);
64+
static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
65+
ssize_t srclen, pg_locale_t locale);
6466
static int strncoll_icu(const char *arg1, ssize_t len1,
6567
const char *arg2, ssize_t len2,
6668
pg_locale_t locale);
@@ -123,7 +125,7 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
123125

124126
/*
125127
* XXX: many of the functions below rely on casts directly from pg_wchar to
126-
* UChar32, which is correct for the UTF-8 encoding, but not in general.
128+
* UChar32, which is correct for UTF-8 and LATIN1, but not in general.
127129
*/
128130

129131
static pg_wchar
@@ -227,6 +229,7 @@ static const struct ctype_methods ctype_methods_icu = {
227229
.strtitle = strtitle_icu,
228230
.strupper = strupper_icu,
229231
.strfold = strfold_icu,
232+
.downcase_ident = downcase_ident_icu,
230233
.wc_isdigit = wc_isdigit_icu,
231234
.wc_isalpha = wc_isalpha_icu,
232235
.wc_isalnum = wc_isalnum_icu,
@@ -564,6 +567,37 @@ strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
564567
return result_len;
565568
}
566569

570+
/*
571+
* For historical compatibility, behavior is not multibyte-aware.
572+
*
573+
* NB: uses libc tolower() for single-byte encodings (also for historical
574+
* compatibility), and therefore relies on the global LC_CTYPE setting.
575+
*/
576+
static size_t
577+
downcase_ident_icu(char *dst, size_t dstsize, const char *src,
578+
ssize_t srclen, pg_locale_t locale)
579+
{
580+
int i;
581+
bool enc_is_single_byte;
582+
583+
enc_is_single_byte = pg_database_encoding_max_length() == 1;
584+
for (i = 0; i < srclen && i < dstsize; i++)
585+
{
586+
unsigned char ch = (unsigned char) src[i];
587+
588+
if (ch >= 'A' && ch <= 'Z')
589+
ch = pg_ascii_tolower(ch);
590+
else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
591+
ch = tolower(ch);
592+
dst[i] = (char) ch;
593+
}
594+
595+
if (i < dstsize)
596+
dst[i] = '\0';
597+
598+
return srclen;
599+
}
600+
567601
/*
568602
* strncoll_icu_utf8
569603
*

src/backend/utils/adt/pg_locale_libc.c

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,12 +318,41 @@ tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
318318
return wc;
319319
}
320320

321+
/*
322+
* Characters A..Z always downcase to a..z, even in the Turkish
323+
* locale. Characters beyond 127 use tolower().
324+
*/
325+
static size_t
326+
downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src,
327+
ssize_t srclen, pg_locale_t locale)
328+
{
329+
locale_t loc = locale->lt;
330+
int i;
331+
332+
for (i = 0; i < srclen && i < dstsize; i++)
333+
{
334+
unsigned char ch = (unsigned char) src[i];
335+
336+
if (ch >= 'A' && ch <= 'Z')
337+
ch = pg_ascii_tolower(ch);
338+
else if (IS_HIGHBIT_SET(ch) && isupper_l(ch, loc))
339+
ch = tolower_l(ch, loc);
340+
dst[i] = (char) ch;
341+
}
342+
343+
if (i < dstsize)
344+
dst[i] = '\0';
345+
346+
return srclen;
347+
}
348+
321349
static const struct ctype_methods ctype_methods_libc_sb = {
322350
.strlower = strlower_libc_sb,
323351
.strtitle = strtitle_libc_sb,
324352
.strupper = strupper_libc_sb,
325353
/* in libc, casefolding is the same as lowercasing */
326354
.strfold = strlower_libc_sb,
355+
.downcase_ident = downcase_ident_libc_sb,
327356
.wc_isdigit = wc_isdigit_libc_sb,
328357
.wc_isalpha = wc_isalpha_libc_sb,
329358
.wc_isalnum = wc_isalnum_libc_sb,
@@ -349,6 +378,8 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
349378
.strupper = strupper_libc_mb,
350379
/* in libc, casefolding is the same as lowercasing */
351380
.strfold = strlower_libc_mb,
381+
/* uses plain ASCII semantics for historical reasons */
382+
.downcase_ident = NULL,
352383
.wc_isdigit = wc_isdigit_libc_sb,
353384
.wc_isalpha = wc_isalpha_libc_sb,
354385
.wc_isalnum = wc_isalnum_libc_sb,
@@ -370,6 +401,8 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
370401
.strupper = strupper_libc_mb,
371402
/* in libc, casefolding is the same as lowercasing */
372403
.strfold = strlower_libc_mb,
404+
/* uses plain ASCII semantics for historical reasons */
405+
.downcase_ident = NULL,
373406
.wc_isdigit = wc_isdigit_libc_mb,
374407
.wc_isalpha = wc_isalpha_libc_mb,
375408
.wc_isalnum = wc_isalnum_libc_mb,

src/include/utils/pg_locale.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ struct ctype_methods
110110
size_t (*strfold) (char *dest, size_t destsize,
111111
const char *src, ssize_t srclen,
112112
pg_locale_t locale);
113+
size_t (*downcase_ident) (char *dest, size_t destsize,
114+
const char *src, ssize_t srclen,
115+
pg_locale_t locale);
113116

114117
/* required */
115118
bool (*wc_isdigit) (pg_wchar wc, pg_locale_t locale);
@@ -187,6 +190,8 @@ extern size_t pg_strupper(char *dst, size_t dstsize,
187190
extern size_t pg_strfold(char *dst, size_t dstsize,
188191
const char *src, ssize_t srclen,
189192
pg_locale_t locale);
193+
extern size_t pg_downcase_ident(char *dst, size_t dstsize,
194+
const char *src, ssize_t srclen);
190195
extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
191196
extern int pg_strncoll(const char *arg1, ssize_t len1,
192197
const char *arg2, ssize_t len2, pg_locale_t locale);

0 commit comments

Comments
 (0)