#include <assert.h>
#include <stdint.h>

#include <algorithm>
#include <mutex>
#include <sstream>
#include <string>
#include <vector>
7+
8+ namespace BlingFire
9+ {
10+
11+ // keep model data together
12+ struct FAModelData
13+ {
14+ // image of the loaded file
15+ FAImageDump m_Img;
16+ FALDB m_Ldb;
17+
18+ // data and const processor for tokenization
19+ FAWbdConfKeeper m_Conf;
20+ FALexTools_t < int > m_Engine;
21+ bool m_hasWbd;
22+
23+ // data and const processor for tokenization
24+ FADictConfKeeper m_DictConf;
25+ // indicates that a pos-dict data are present in the bin LDB file
26+ bool m_hasSeg;
27+
28+ // Unigram LM algorithm
29+ FATokenSegmentationTools_1best_t < int > m_SegEngine;
30+ // BPE (with separate merge ranks, ID's are ranks) runtime
31+ FATokenSegmentationTools_1best_bpe_t < int > m_SegEngineBpe;
32+ // BPE (with separate merge ranks) runtime
33+ FATokenSegmentationTools_1best_bpe_with_merges_t < int > m_SegEngineBpeWithMerges;
34+ // one selected algorithm for this bin file
35+ const FATokenSegmentationToolsCA_t < int > * m_pAlgo;
36+ // indicates wether characters are bytes of the UTF-8 rather than the Unicode symbols
37+ bool m_useRawBytes;
38+
39+ // Hyphenation / Syllabification data
40+ bool m_hasHy;
41+ FAHyphConfKeeper m_HyConf;
42+ FAHyphInterpreter_core_t < int > m_HyEngine;
43+
44+ // id2word lexicon data
45+ bool m_hasI2w;
46+ FAStringArray_pack m_i2w;
47+ int m_min_token_id; // min regular token id, needed to separate special tokens
48+ int m_max_token_id; // max regular token id, needed to separate special tokens
49+
50+
51+ FAModelData ():
52+ m_hasWbd (false ),
53+ m_hasSeg (false ),
54+ m_pAlgo (NULL ),
55+ m_useRawBytes (false ),
56+ m_hasHy (false ),
57+ m_hasI2w (false ),
58+ m_min_token_id (0 ),
59+ m_max_token_id (FALimits::MaxArrSize)
60+ {}
61+ };
62+
// SentencePiece delimiter: U+2581 (LOWER ONE EIGHTH BLOCK).
#define __FASpDelimiter__ 0x2581

// Default hyphen: U+2012 (FIGURE DASH).
#define __FADefaultHyphen__ 0x2012

// Whitespace test for a single code point C. Evaluates to true when C is
//   <= 0x20, 0xA0, 0x2000..0x200F, 0x202F, 0x205F, 0x2060, 0x2420, 0x2424,
//   0x3000 or 0xFEFF.
//
// NOTE(review): the previous comment described the class as
//   [\x0004-\x0020\x007F-\x009F\x00A0\x2000-\x200F\x202F\x205F\x2060\x2420\x2424\x3000\xFEFF]
// but the expression accepts every value <= 0x20 and does not test
// 0x7F..0x9F. The tested set is kept exactly as it was; confirm which of the
// two is intended before changing it.
//
// There must be no whitespace between the macro name and '(' -- otherwise the
// preprocessor defines an object-like macro. Each use of the argument is
// parenthesized so that expressions such as `a | b` are compared as a whole.
// The argument is evaluated several times: do not pass expressions with side
// effects.
#define __FAIsWhiteSpace__(C) ( \
    ((C) <= 0x20) || ((C) == 0xa0) || ((C) >= 0x2000 && (C) <= 0x200f) || \
    ((C) == 0x202f) || ((C) == 0x205f) || ((C) == 0x2060) || ((C) == 0x2420) || \
    ((C) == 0x2424) || ((C) == 0x3000) || ((C) == 0xfeff) \
    )
75+
76+ extern " C"
77+ {
78+ const int GetBlingFireTokVersion ();
79+ const int TextToSentencesWithOffsetsWithModel (const char * pInUtf8Str, int InUtf8StrByteCount,
80+ char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, const int MaxOutUtf8StrByteCount,
81+ void * hModel);
82+ const int TextToSentencesWithOffsets (const char * pInUtf8Str, int InUtf8StrByteCount,
83+ char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, const int MaxOutUtf8StrByteCount);
84+ const int TextToSentencesWithModel (const char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, void * hModel);
85+ const int TextToSentences (const char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount);
86+ const int TextToWordsWithOffsetsWithModel (const char * pInUtf8Str, int InUtf8StrByteCount,
87+ char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, const int MaxOutUtf8StrByteCount,
88+ void * hModel);
89+ const int TextToWordsWithOffsets (const char * pInUtf8Str, int InUtf8StrByteCount,
90+ char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, const int MaxOutUtf8StrByteCount);
91+ const int TextToWordsWithModel (const char * pInUtf8Str, int InUtf8StrByteCount,
92+ char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, void * hModel);
93+ const int TextToWords (const char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount);
94+ const int NormalizeSpaces (const char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, const int uSpace = __FASpDelimiter__);
95+ const int TextToHashes (const char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pHashArr, const int MaxHashArrLength, int wordNgrams, int bucketSize = 2000000 );
96+ const int WordHyphenationWithModel (const char * pInUtf8Str, int InUtf8StrByteCount,
97+ char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, void * hModel, const int uHy = __FADefaultHyphen__);
98+ void * SetModelData (FAModelData * pNewModelData, const unsigned char * pImgBytes);
99+ void * SetModel (const unsigned char * pImgBytes, int ModelByteCount);
100+ void * LoadModel (const char * pszLdbFileName);
101+ const int TextToIdsWithOffsets_wp (
102+ void * ModelPtr,
103+ const char * pInUtf8Str,
104+ int InUtf8StrByteCount,
105+ int32_t * pIdsArr,
106+ int * pStartOffsets,
107+ int * pEndOffsets,
108+ const int MaxIdsArrLength,
109+ const int UnkId = 0
110+ );
111+ const int TextToIds_wp (
112+ void * ModelPtr,
113+ const char * pInUtf8Str,
114+ int InUtf8StrByteCount,
115+ int32_t * pIdsArr,
116+ const int MaxIdsArrLength,
117+ const int UnkId = 0
118+ );
119+ const int TextToIdsWithOffsets_sp (
120+ void * ModelPtr,
121+ const char * pInUtf8Str,
122+ int InUtf8StrByteCount,
123+ int32_t * pIdsArr,
124+ int * pStartOffsets,
125+ int * pEndOffsets,
126+ const int MaxIdsArrLength,
127+ const int UnkId = 0
128+ );
129+ const int TextToIds_sp (
130+ void * ModelPtr,
131+ const char * pInUtf8Str,
132+ int InUtf8StrByteCount,
133+ int32_t * pIdsArr,
134+ const int MaxIdsArrLength,
135+ const int UnkId = 0
136+ );
137+ const int TextToIdsWithOffsets (
138+ void * ModelPtr,
139+ const char * pInUtf8Str,
140+ int InUtf8StrByteCount,
141+ int32_t * pIdsArr,
142+ int * pStartOffsets,
143+ int * pEndOffsets,
144+ const int MaxIdsArrLength,
145+ const int UnkId = 0
146+ );
147+ const int TextToIds (
148+ void * ModelPtr,
149+ const char * pInUtf8Str,
150+ int InUtf8StrByteCount,
151+ int32_t * pIdsArr,
152+ const int MaxIdsArrLength,
153+ const int UnkId = 0
154+ );
155+ int FreeModel (void * ModelPtr);
156+ int SetNoDummyPrefix (void * ModelPtr, bool fNoDummyPrefix );
157+ int IdsToText (void * ModelPtr, const int32_t * pIdsArr, const int IdsCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, bool SkipSpecialTokens);
158+ }
159+ }
0 commit comments