Commit 9fef11a

add quantinizer
1 parent 4c0ca67 commit 9fef11a

File tree

3 files changed (+100, -1 lines changed)

README.md

Lines changed: 49 additions & 0 deletions
@@ -13,6 +13,7 @@ Table of Contents
 - [Environment Requirements](#environment-requirements)
 - [Model Weights](#model-weights)
 - [Inference Example](#inference-example)
+- [Quantized through bitsandbytes](#quantized-through-bitsandbytes)
 3. [Data for aiXcoder 7B](#data-for-aixcoder-7b)
 4. [Training](#training)
 - [Training Hyperparameters](#training-hyperparameters)
@@ -249,6 +250,54 @@ def quick_sort(arr):
 
 ```
 
+### Quantized through bitsandbytes
+
+We can also install bitsandbytes and accelerate via `pip install bitsandbytes accelerate`, then simply add a quantization config to run int8 or int4 inference (to further reduce the temporary GPU memory allocated at runtime, installing FlashAttention is recommended):
+
+```python
+import sys
+import torch
+from hf_mini.utils import input_wrapper
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+# to use 4bit use `load_in_4bit=True` instead
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+
+device = "cuda" # the device to load the model onto
+
+tokenizer = AutoTokenizer.from_pretrained("aiXcoder/aixcoder-7b-base")
+model = AutoModelForCausalLM.from_pretrained("aiXcoder/aixcoder-7b-base", quantization_config=bnb_config, device_map=device, attn_implementation='flash_attention_2')
+
+text = input_wrapper(
+    code_string="# 快速排序算法",
+    later_code="\n",
+    path="test.py"
+)
+
+if len(text) == 0:
+    sys.exit()
+
+inputs = tokenizer(text, return_tensors="pt", return_token_type_ids=False)
+inputs = inputs.to(device)
+
+outputs = model.generate(**inputs, max_new_tokens=256)
+print(f"Model memory footprint: {model.get_memory_footprint() / 2**20:.2f} MB")
+print(f"Torch max memory allocated: {torch.cuda.max_memory_allocated() / 2**20:.2f} MB")
+
+"""
+load_in_4bit=True:
+- Model memory footprint: 5656.52 MB
+- Torch max memory allocated: 6448.89 MB
+
+load_in_8bit=True:
+- Model memory footprint: 9008.52 MB
+- Torch max memory allocated: 10061.51 MB
+"""
+```
 
 ## Data for aiXcoder 7B
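For reference, here is a minimal sketch of the 4-bit variant hinted at by the `# to use 4bit` comment above. The `nf4` quant type, `bfloat16` compute dtype, and double quantization are illustrative choices, not settings documented for aiXcoder 7B; the memory numbers quoted above correspond to plain `load_in_4bit=True` as shown in the README:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Illustrative 4-bit configuration (assumed settings, not from the README):
# NF4 weights, bfloat16 compute, and nested quantization to save a bit more memory.
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained("aiXcoder/aixcoder-7b-base")
model = AutoModelForCausalLM.from_pretrained(
    "aiXcoder/aixcoder-7b-base",
    quantization_config=bnb_config_4bit,
    device_map="cuda",
)
print(f"Model memory footprint: {model.get_memory_footprint() / 2**20:.2f} MB")
```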

README_CN.md

Lines changed: 48 additions & 0 deletions
@@ -13,6 +13,7 @@
 - [Environment Requirements](#运行环境)
 - [Model Weights](#模型权重)
 - [Inference Example](#推理示例)
+- [Quantized inference with bitsandbytes](#bitsandbytes-量化执行)
 3. [aiXcoder 7B Training Data](#aixcoder-7b-训练数据)
 4. [Training](#训练)
 - [Training Hyperparameters](#训练超参数)
@@ -240,6 +241,53 @@ def quick_sort(arr):
     quick_sort(arr[left + 1:])
     return arr</s>
 """
+```
+
+### Quantized inference with bitsandbytes
+
+We can also install bitsandbytes and accelerate via `pip install bitsandbytes accelerate`, then simply add a quantization config to run int8 or int4 inference (to further reduce the temporary GPU memory allocated at runtime, installing FlashAttention is recommended):
+
+```python
+import sys
+import torch
+from hf_mini.utils import input_wrapper
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+# to use 4bit use `load_in_4bit=True` instead
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+
+device = "cuda" # the device to load the model onto
+
+tokenizer = AutoTokenizer.from_pretrained("aiXcoder/aixcoder-7b-base")
+model = AutoModelForCausalLM.from_pretrained("aiXcoder/aixcoder-7b-base", quantization_config=bnb_config, device_map=device, attn_implementation='flash_attention_2')
+
+text = input_wrapper(
+    code_string="# 快速排序算法",
+    later_code="\n",
+    path="test.py"
+)
+
+if len(text) == 0:
+    sys.exit()
+
+inputs = tokenizer(text, return_tensors="pt", return_token_type_ids=False)
+inputs = inputs.to(device)
+
+outputs = model.generate(**inputs, max_new_tokens=256)
+print(f"Model memory footprint: {model.get_memory_footprint() / 2**20:.2f} MB")
+print(f"Torch max memory allocated: {torch.cuda.max_memory_allocated() / 2**20:.2f} MB")
+
+"""
+load_in_4bit=True:
+- Model memory footprint: 5656.52 MB
+- Torch max memory allocated: 6448.89 MB
+
+load_in_8bit=True:
+- Model memory footprint: 9008.52 MB
+- Torch max memory allocated: 10061.51 MB
+"""
 
 ```
 
hf_mini/utils.py

Lines changed: 3 additions & 1 deletion
@@ -1090,17 +1090,19 @@
 
 
 import re
+import time
 from hf_mini.filter import SensitiveInforRM
 is_security = SensitiveInforRM()
 
 def input_wrapper(code_string, later_code: str = "", path: str = "") -> str:
 
+    start = time.time()
     _sequerity = True
     for i in [code_string, later_code, path]:
        if not is_security.is_security(i):
            _sequerity = False
            break
-
+    print(f"Done inputs checking with {(time.time()-start) * 1000:.2f}ms", flush=True)
    if not _sequerity:
        return ""
 
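With this change, `input_wrapper` now prints how long the sensitive-information check took before returning the wrapped prompt (or an empty string if the check fails). A minimal usage sketch, assuming `hf_mini` is importable from the repository root; the timing value in the comment is illustrative:

```python
from hf_mini.utils import input_wrapper

# Emits a line like "Done inputs checking with 0.35ms" on stdout (value illustrative),
# then returns the wrapped prompt, or "" if the sensitive-information check fails.
text = input_wrapper(
    code_string="# 快速排序算法",
    later_code="\n",
    path="test.py"
)
if not text:
    print("Input rejected by the sensitive-information filter")
```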