Skip to content

Commit ab75c26

Browse files
committed
Adding support for CMake Install target and c++ consumers
1 parent 2061d6d commit ab75c26

File tree

4 files changed

+185
-74
lines changed

4 files changed

+185
-74
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,17 @@ x32/
3131
x64/
3232
x86/
3333
bld/
34+
build/
3435
[Bb]in/
3536
[Oo]bj/
3637
[Ll]og/
3738

3839
# Visual Studio 2015/2017 cache/options directory
3940
.vs/
41+
42+
# Visual Studio Code cache directory
43+
.vscode/
44+
4045
# Uncomment if you have tasks that create the project's static files in wwwroot
4146
#wwwroot/
4247

CMakeLists.txt

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ file(GLOB CLIENT_SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/blingfireclient.libra
5151
file(GLOB CLIENT_RESOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/blingfireclient.library/src/*.cxx")
5252

5353
file(GLOB COMPILE_HEADER_FILES "${CMAKE_CURRENT_SOURCE_DIR}/blingfirecompile.library/inc/*.h")
54-
file(GLOB COMPILE_RESOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/blingfirecompile.library/src/*.cxx")
5554
file(GLOB COMPILE_SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/blingfirecompile.library/src/*.cpp")
55+
file(GLOB COMPILE_RESOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/blingfirecompile.library/src/*.cxx")
5656

5757
# build blingfire client
5858
add_library(fsaClient STATIC ${CLIENT_HEADER_FILES} ${CLIENT_SOURCE_FILES} ${CLIENT_RESOURCE_FILES})
@@ -84,15 +84,24 @@ ENDMACRO()
8484
# find all tools dirs
8585
SUBDIRLIST(SUBDIRS ${CMAKE_CURRENT_SOURCE_DIR}/blingfiretools)
8686

87+
include(GNUInstallDirs)
88+
8789
# build tools
8890
FOREACH(dir ${SUBDIRS})
8991
get_filename_component(dirname ${dir} NAME)
9092
file(GLOB sourcefile ${CMAKE_CURRENT_SOURCE_DIR}/blingfiretools/${dir}/*.cpp)
9193
file(GLOB resourcefile ${CMAKE_CURRENT_SOURCE_DIR}/blingfiretools/${dir}/*.cxx)
9294
file(GLOB deffile ${CMAKE_CURRENT_SOURCE_DIR}/blingfiretools/${dir}/*.def)
9395
IF(${dirname} STREQUAL "blingfiretokdll")
94-
add_library(${dirname} SHARED ${sourcefile} ${resourcefile} ${deffile})
96+
file(GLOB headerfile ${CMAKE_CURRENT_SOURCE_DIR}/blingfiretools/${dir}/*.h)
97+
98+
add_library(${dirname} SHARED ${sourcefile} ${headerfile} ${resourcefile} ${deffile})
9599
add_library(${dirname}_static ${sourcefile} ${resourcefile} ${deffile})
100+
101+
INSTALL(
102+
FILES ${headerfile}
103+
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
104+
96105
target_link_libraries(${dirname} fsaClient)
97106
target_link_libraries(${dirname}_static fsaClient)
98107

@@ -111,5 +120,13 @@ FOREACH(dir ${SUBDIRS})
111120
target_link_libraries(${dirname} iconv)
112121
ENDIF()
113122
ENDIF()
114-
115123
ENDFOREACH()
124+
125+
INSTALL(
126+
TARGETS blingfiretokdll
127+
EXPORT blingfire-targets
128+
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
129+
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
130+
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
131+
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
132+
)

blingfiretools/blingfiretokdll/blingfiretokdll.cpp

Lines changed: 1 addition & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,7 @@
1313
#include "FAHyphInterpreter_core_t.h"
1414
#include "FAStringArray_pack.h"
1515

16-
#include <algorithm>
17-
#include <vector>
18-
#include <string>
19-
#include <sstream>
20-
#include <mutex>
21-
#include <assert.h>
16+
#include "blingfiretokdll.h"
2217

2318
/*
2419
This library provides easy interface to sentence and word-breaking functionality
@@ -48,58 +43,6 @@ const int WBD_IGNORE_TAG = 4;
4843
volatile bool g_fInitialized = false;
4944
std::mutex g_InitializationMutex; // this mutex is used once for default models only
5045

51-
// keep model data together
52-
struct FAModelData
53-
{
54-
// image of the loaded file
55-
FAImageDump m_Img;
56-
FALDB m_Ldb;
57-
58-
// data and const processor for tokenization
59-
FAWbdConfKeeper m_Conf;
60-
FALexTools_t < int > m_Engine;
61-
bool m_hasWbd;
62-
63-
// data and const processor for tokenization
64-
FADictConfKeeper m_DictConf;
65-
// indicates that pos-dict data is present in the bin LDB file
66-
bool m_hasSeg;
67-
68-
// Unigram LM algorithm
69-
FATokenSegmentationTools_1best_t < int > m_SegEngine;
70-
// BPE (with separate merge ranks, ID's are ranks) runtime
71-
FATokenSegmentationTools_1best_bpe_t < int > m_SegEngineBpe;
72-
// BPE (with separate merge ranks) runtime
73-
FATokenSegmentationTools_1best_bpe_with_merges_t < int > m_SegEngineBpeWithMerges;
74-
// one selected algorithm for this bin file
75-
const FATokenSegmentationToolsCA_t < int > * m_pAlgo;
76-
// indicates whether characters are bytes of the UTF-8 encoding rather than Unicode symbols
77-
bool m_useRawBytes;
78-
79-
// Hyphenation / Syllabification data
80-
bool m_hasHy;
81-
FAHyphConfKeeper m_HyConf;
82-
FAHyphInterpreter_core_t < int > m_HyEngine;
83-
84-
// id2word lexicon data
85-
bool m_hasI2w;
86-
FAStringArray_pack m_i2w;
87-
int m_min_token_id; // min regular token id, needed to separate special tokens
88-
int m_max_token_id; // max regular token id, needed to separate special tokens
89-
90-
91-
FAModelData ():
92-
m_hasWbd (false),
93-
m_hasSeg (false),
94-
m_pAlgo (NULL),
95-
m_useRawBytes (false),
96-
m_hasHy (false),
97-
m_hasI2w (false),
98-
m_min_token_id (0),
99-
m_max_token_id (FALimits::MaxArrSize)
100-
{}
101-
};
102-
10346
#ifndef SIZE_OPTIMIZATION
10447
// keep two built-in models one for default WBD and one for default SBD
10548
FAModelData g_DefaultWbd;
@@ -139,19 +82,6 @@ void InitializeWbdSbd()
13982
}
14083
#endif
14184

142-
// SENTENCE PIECE DELIMITER
143-
#define __FASpDelimiter__ 0x2581
144-
145-
// DEFAULT HYPHEN
146-
#define __FADefaultHyphen__ 0x2012
147-
148-
// WHITESPACE [\x0004-\x0020\x007F-\x009F\x00A0\x2000-\x200F\x202F\x205F\x2060\x2420\x2424\x3000\xFEFF]
149-
#define __FAIsWhiteSpace__(C) ( \
150-
(C <= 0x20 || C == 0xa0 || (C >= 0x2000 && C <= 0x200f) || \
151-
C == 0x202f || C == 0x205f || C == 0x2060 || C == 0x2420 || \
152-
C == 0x2424 || C == 0x3000 || C == 0xfeff) \
153-
)
154-
15585

15686
inline int FAGetFirstNonWhiteSpace(int * pStr, const int StrLen)
15787
{
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
#include <algorithm>
2+
#include <vector>
3+
#include <string>
4+
#include <sstream>
5+
#include <mutex>
6+
#include <assert.h>
7+
8+
namespace BlingFire
9+
{
10+
11+
// keep model data together
12+
struct FAModelData
13+
{
14+
// image of the loaded file
15+
FAImageDump m_Img;
16+
FALDB m_Ldb;
17+
18+
// data and const processor for tokenization
19+
FAWbdConfKeeper m_Conf;
20+
FALexTools_t < int > m_Engine;
21+
bool m_hasWbd;
22+
23+
// data and const processor for tokenization
24+
FADictConfKeeper m_DictConf;
25+
// indicates that pos-dict data is present in the bin LDB file
26+
bool m_hasSeg;
27+
28+
// Unigram LM algorithm
29+
FATokenSegmentationTools_1best_t < int > m_SegEngine;
30+
// BPE (with separate merge ranks, ID's are ranks) runtime
31+
FATokenSegmentationTools_1best_bpe_t < int > m_SegEngineBpe;
32+
// BPE (with separate merge ranks) runtime
33+
FATokenSegmentationTools_1best_bpe_with_merges_t < int > m_SegEngineBpeWithMerges;
34+
// one selected algorithm for this bin file
35+
const FATokenSegmentationToolsCA_t < int > * m_pAlgo;
36+
// indicates whether characters are bytes of the UTF-8 encoding rather than Unicode symbols
37+
bool m_useRawBytes;
38+
39+
// Hyphenation / Syllabification data
40+
bool m_hasHy;
41+
FAHyphConfKeeper m_HyConf;
42+
FAHyphInterpreter_core_t < int > m_HyEngine;
43+
44+
// id2word lexicon data
45+
bool m_hasI2w;
46+
FAStringArray_pack m_i2w;
47+
int m_min_token_id; // min regular token id, needed to separate special tokens
48+
int m_max_token_id; // max regular token id, needed to separate special tokens
49+
50+
51+
FAModelData ():
52+
m_hasWbd (false),
53+
m_hasSeg (false),
54+
m_pAlgo (NULL),
55+
m_useRawBytes (false),
56+
m_hasHy (false),
57+
m_hasI2w (false),
58+
m_min_token_id (0),
59+
m_max_token_id (FALimits::MaxArrSize)
60+
{}
61+
};
62+
63+
// SENTENCE PIECE DELIMITER
64+
#define __FASpDelimiter__ 0x2581
65+
66+
// DEFAULT HYPHEN
67+
#define __FADefaultHyphen__ 0x2012
68+
69+
// WHITESPACE [\x0004-\x0020\x007F-\x009F\x00A0\x2000-\x200F\x202F\x205F\x2060\x2420\x2424\x3000\xFEFF]
70+
#define __FAIsWhiteSpace__(C) ( \
71+
(C <= 0x20 || C == 0xa0 || (C >= 0x2000 && C <= 0x200f) || \
72+
C == 0x202f || C == 0x205f || C == 0x2060 || C == 0x2420 || \
73+
C == 0x2424 || C == 0x3000 || C == 0xfeff) \
74+
)
75+
76+
extern "C"
77+
{
78+
const int GetBlingFireTokVersion();
79+
const int TextToSentencesWithOffsetsWithModel(const char * pInUtf8Str, int InUtf8StrByteCount,
80+
char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, const int MaxOutUtf8StrByteCount,
81+
void * hModel);
82+
const int TextToSentencesWithOffsets(const char * pInUtf8Str, int InUtf8StrByteCount,
83+
char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, const int MaxOutUtf8StrByteCount);
84+
const int TextToSentencesWithModel(const char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, void * hModel);
85+
const int TextToSentences(const char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount);
86+
const int TextToWordsWithOffsetsWithModel(const char * pInUtf8Str, int InUtf8StrByteCount,
87+
char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, const int MaxOutUtf8StrByteCount,
88+
void * hModel);
89+
const int TextToWordsWithOffsets(const char * pInUtf8Str, int InUtf8StrByteCount,
90+
char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, const int MaxOutUtf8StrByteCount);
91+
const int TextToWordsWithModel(const char * pInUtf8Str, int InUtf8StrByteCount,
92+
char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, void * hModel);
93+
const int TextToWords(const char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount);
94+
const int NormalizeSpaces(const char * pInUtf8Str, int InUtf8StrByteCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, const int uSpace = __FASpDelimiter__);
95+
const int TextToHashes(const char * pInUtf8Str, int InUtf8StrByteCount, int32_t * pHashArr, const int MaxHashArrLength, int wordNgrams, int bucketSize = 2000000);
96+
const int WordHyphenationWithModel(const char * pInUtf8Str, int InUtf8StrByteCount,
97+
char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, void * hModel, const int uHy = __FADefaultHyphen__);
98+
void* SetModelData(FAModelData * pNewModelData, const unsigned char * pImgBytes);
99+
void* SetModel(const unsigned char * pImgBytes, int ModelByteCount);
100+
void* LoadModel(const char * pszLdbFileName);
101+
const int TextToIdsWithOffsets_wp(
102+
void* ModelPtr,
103+
const char * pInUtf8Str,
104+
int InUtf8StrByteCount,
105+
int32_t * pIdsArr,
106+
int * pStartOffsets,
107+
int * pEndOffsets,
108+
const int MaxIdsArrLength,
109+
const int UnkId = 0
110+
);
111+
const int TextToIds_wp(
112+
void* ModelPtr,
113+
const char * pInUtf8Str,
114+
int InUtf8StrByteCount,
115+
int32_t * pIdsArr,
116+
const int MaxIdsArrLength,
117+
const int UnkId = 0
118+
);
119+
const int TextToIdsWithOffsets_sp(
120+
void* ModelPtr,
121+
const char * pInUtf8Str,
122+
int InUtf8StrByteCount,
123+
int32_t * pIdsArr,
124+
int * pStartOffsets,
125+
int * pEndOffsets,
126+
const int MaxIdsArrLength,
127+
const int UnkId = 0
128+
);
129+
const int TextToIds_sp(
130+
void* ModelPtr,
131+
const char * pInUtf8Str,
132+
int InUtf8StrByteCount,
133+
int32_t * pIdsArr,
134+
const int MaxIdsArrLength,
135+
const int UnkId = 0
136+
);
137+
const int TextToIdsWithOffsets(
138+
void* ModelPtr,
139+
const char * pInUtf8Str,
140+
int InUtf8StrByteCount,
141+
int32_t * pIdsArr,
142+
int * pStartOffsets,
143+
int * pEndOffsets,
144+
const int MaxIdsArrLength,
145+
const int UnkId = 0
146+
);
147+
const int TextToIds(
148+
void* ModelPtr,
149+
const char * pInUtf8Str,
150+
int InUtf8StrByteCount,
151+
int32_t * pIdsArr,
152+
const int MaxIdsArrLength,
153+
const int UnkId = 0
154+
);
155+
int FreeModel(void* ModelPtr);
156+
int SetNoDummyPrefix(void* ModelPtr, bool fNoDummyPrefix);
157+
int IdsToText (void* ModelPtr, const int32_t * pIdsArr, const int IdsCount, char * pOutUtf8Str, const int MaxOutUtf8StrByteCount, bool SkipSpecialTokens);
158+
}
159+
}

0 commit comments

Comments
 (0)