purescript/src/Language/PureScript/PSString.hs at master · python6460/purescript

History

240 lines (209 loc) · 8 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

module Language.PureScript.PSString

( PSString

, toUTF16CodeUnits

, decodeString

, decodeStringEither

, decodeStringWithReplacement

, prettyPrintString

, prettyPrintStringJS

, mkString

) where

import Prelude

import GHC.Generics (Generic)

import Codec.Serialise (Serialise)

import Control.DeepSeq (NFData)

import Control.Exception (try, evaluate)

import Control.Applicative ((<|>))

import qualified Data.Char as Char

import Data.Bits (shiftR)

import Data.Either (fromRight)

import Data.List (unfoldr)

import Data.Scientific (toBoundedInteger)

import Data.String (IsString(..))

import Data.ByteString (ByteString)

import qualified Data.ByteString as BS

import Data.Text (Text)

import qualified Data.Text as T

import Data.Text.Encoding (decodeUtf16BE)

import Data.Text.Encoding.Error (UnicodeException)

import qualified Data.Vector as V

import Data.Word (Word16, Word8)

import Numeric (showHex)

import System.IO.Unsafe (unsafePerformIO)

import qualified Data.Aeson as A

import qualified Data.Aeson.Types as A

-- |

-- Strings in PureScript are sequences of UTF-16 code units, which do not

-- necessarily represent UTF-16 encoded text. For example, it is permissible

-- for a string to contain *lone surrogates,* i.e. characters in the range

-- U+D800 to U+DFFF which do not appear as a part of a surrogate pair.

-- The Show instance for PSString produces a string literal which would

-- represent the same data were it inserted into a PureScript source file.

-- Because JSON parsers vary wildly in terms of how they deal with lone

-- surrogates in JSON strings, the ToJSON instance for PSString produces JSON

-- strings where that would be safe (i.e. when there are no lone surrogates),

-- and arrays of UTF-16 code units (integers) otherwise.

newtype PSString = PSString { toUTF16CodeUnits :: [Word16] }

deriving (Eq, Ord, Semigroup, Monoid, Generic)

instance NFData PSString

instance Serialise PSString

instance Show PSString where

show = show . codePoints

-- |

-- Decode a PSString to a String, representing any lone surrogates as the

-- reserved code point with that index. Warning: if there are any lone

-- surrogates, converting the result to Text via Data.Text.pack will result in

-- loss of information as those lone surrogates will be replaced with U+FFFD

-- REPLACEMENT CHARACTER. Because this function requires care to use correctly,

-- we do not export it.

codePoints :: PSString -> String

codePoints = map (either (Char.chr . fromIntegral) id) . decodeStringEither

-- |

-- Decode a PSString as UTF-16 text. Lone surrogates will be replaced with

-- U+FFFD REPLACEMENT CHARACTER

decodeStringWithReplacement :: PSString -> String

decodeStringWithReplacement = map (fromRight '\xFFFD') . decodeStringEither

-- |

-- Decode a PSString as UTF-16. Lone surrogates in the input are represented in

-- the output with the Left constructor; characters which were successfully

-- decoded are represented with the Right constructor.

decodeStringEither :: PSString -> [Either Word16 Char]

decodeStringEither = unfoldr decode . toUTF16CodeUnits

where

decode :: [Word16] -> Maybe (Either Word16 Char, [Word16])

decode (h:l:rest) | isLead h && isTrail l = Just (Right (unsurrogate h l), rest)

decode (c:rest) | isSurrogate c = Just (Left c, rest)

decode (c:rest) = Just (Right (toChar c), rest)

decode [] = Nothing

unsurrogate :: Word16 -> Word16 -> Char

unsurrogate h l = toEnum ((toInt h - 0xD800) * 0x400 + (toInt l - 0xDC00) + 0x10000)

-- |

-- Attempt to decode a PSString as UTF-16 text. This will fail (returning

-- Nothing) if the argument contains lone surrogates.

decodeString :: PSString -> Maybe Text

decodeString = hush . decodeEither . BS.pack . concatMap unpair . toUTF16CodeUnits

where

unpair w = [highByte w, lowByte w]

lowByte :: Word16 -> Word8

lowByte = fromIntegral

highByte :: Word16 -> Word8

highByte = fromIntegral . (`shiftR` 8)

-- Based on a similar function from Data.Text.Encoding for utf8. This is a

-- safe usage of unsafePerformIO because there are no side effects after

-- handling any thrown UnicodeExceptions.

decodeEither :: ByteString -> Either UnicodeException Text

decodeEither = unsafePerformIO . try . evaluate . decodeUtf16BE

hush = either (const Nothing) Just

instance IsString PSString where

fromString a = PSString $ concatMap encodeUTF16 a

where

surrogates :: Char -> (Word16, Word16)

surrogates c = (toWord (h + 0xD800), toWord (l + 0xDC00))

where (h, l) = divMod (fromEnum c - 0x10000) 0x400

encodeUTF16 :: Char -> [Word16]

encodeUTF16 c | fromEnum c > 0xFFFF = [high, low]

where (high, low) = surrogates c

encodeUTF16 c = [toWord $ fromEnum c]

instance A.ToJSON PSString where

toJSON str =

case decodeString str of

Just t -> A.toJSON t

Nothing -> A.toJSON (toUTF16CodeUnits str)

instance A.FromJSON PSString where

parseJSON a = jsonString <|> arrayOfCodeUnits

where

jsonString = fromString <$> A.parseJSON a

arrayOfCodeUnits = PSString <$> parseArrayOfCodeUnits a

parseArrayOfCodeUnits :: A.Value -> A.Parser [Word16]

parseArrayOfCodeUnits = A.withArray "array of UTF-16 code units" (traverse parseCodeUnit . V.toList)

parseCodeUnit :: A.Value -> A.Parser Word16

parseCodeUnit b = A.withScientific "two-byte non-negative integer" (maybe (A.typeMismatch "" b) return . toBoundedInteger) b

-- |

-- Pretty print a PSString, using PureScript escape sequences.

prettyPrintString :: PSString -> Text

prettyPrintString s = "\"" <> foldMap encodeChar (decodeStringEither s) <> "\""

where

encodeChar :: Either Word16 Char -> Text

encodeChar (Left c) = "\\x" <> showHex' 6 c

encodeChar (Right c)

| c == '\t' = "\\t"

| c == '\r' = "\\r"

| c == '\n' = "\\n"

| c == '"' = "\\\""

| c == '\'' = "\\\'"

| c == '\\' = "\\\\"

| shouldPrint c = T.singleton c

| otherwise = "\\x" <> showHex' 6 (Char.ord c)

-- Note we do not use Data.Char.isPrint here because that includes things

-- like zero-width spaces and combining punctuation marks, which could be

-- confusing to print unescaped.

shouldPrint :: Char -> Bool

-- The standard space character, U+20 SPACE, is the only space char we should

-- print without escaping

shouldPrint ' ' = True

shouldPrint c =

Char.generalCategory c `elem`

[ Char.UppercaseLetter

, Char.LowercaseLetter

, Char.TitlecaseLetter

, Char.OtherLetter

, Char.DecimalNumber

, Char.LetterNumber

, Char.OtherNumber

, Char.ConnectorPunctuation

, Char.DashPunctuation

, Char.OpenPunctuation

, Char.ClosePunctuation

, Char.InitialQuote

, Char.FinalQuote

, Char.OtherPunctuation

, Char.MathSymbol

, Char.CurrencySymbol

, Char.ModifierSymbol

, Char.OtherSymbol

]

-- |

-- Pretty print a PSString, using JavaScript escape sequences. Intended for

-- use in compiled JS output.

prettyPrintStringJS :: PSString -> Text

prettyPrintStringJS s = "\"" <> foldMap encodeChar (toUTF16CodeUnits s) <> "\""

where

encodeChar :: Word16 -> Text

encodeChar c | c > 0xFF = "\\u" <> showHex' 4 c

encodeChar c | c > 0x7E || c < 0x20 = "\\x" <> showHex' 2 c

encodeChar c | toChar c == '\b' = "\\b"

encodeChar c | toChar c == '\t' = "\\t"

encodeChar c | toChar c == '\n' = "\\n"

encodeChar c | toChar c == '\v' = "\\v"

encodeChar c | toChar c == '\f' = "\\f"

encodeChar c | toChar c == '\r' = "\\r"

encodeChar c | toChar c == '"' = "\\\""

encodeChar c | toChar c == '\\' = "\\\\"

encodeChar c = T.singleton $ toChar c

showHex' :: Enum a => Int -> a -> Text

showHex' width c =

let hs = showHex (fromEnum c) "" in

T.pack (replicate (width - length hs) '0' <> hs)

isLead :: Word16 -> Bool

isLead h = h >= 0xD800 && h <= 0xDBFF

isTrail :: Word16 -> Bool

isTrail l = l >= 0xDC00 && l <= 0xDFFF

isSurrogate :: Word16 -> Bool

isSurrogate c = isLead c || isTrail c

toChar :: Word16 -> Char

toChar = toEnum . fromIntegral

toWord :: Int -> Word16

toWord = fromIntegral

toInt :: Word16 -> Int

toInt = fromIntegral

mkString :: Text -> PSString

mkString = fromString . T.unpack

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

PSString.hs

Latest commit

History

PSString.hs

File metadata and controls