Commit 9fef11a

add quantinizer
1 parent 4c0ca67 commit 9fef11a

File tree

3 files changed (+100, -1 lines changed)

README.md

Lines changed: 49 additions & 0 deletions
@@ -13,6 +13,7 @@ Table of Contents
 - [Environment Requirements](#environment-requirements)
 - [Model Weights](#model-weights)
 - [Inference Example](#inference-example)
+- [Quantized through bitsandbytes](#quantized-through-bitsandbytes)
 3. [Data for aiXcoder 7B](#data-for-aixcoder-7b)
 4. [Training](#training)
 - [Training Hyperparameters](#training-hyperparameters)
@@ -249,6 +250,54 @@ def quick_sort(arr):
 
 ```
 
+### Quantized through bitsandbytes
+
+We can also install bitsandbytes and accelerate via `pip install bitsandbytes accelerate`, then simply add a quantization config to run int8 or int4 inference (to further reduce the temporary GPU memory allocated at runtime, installing FlashAttention is recommended):
+
+```python
+import sys
+import torch
+from hf_mini.utils import input_wrapper
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+# to use 4bit use `load_in_4bit=True` instead
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+
+device = "cuda" # the device to load the model onto
+
+tokenizer = AutoTokenizer.from_pretrained("aiXcoder/aixcoder-7b-base")
+model = AutoModelForCausalLM.from_pretrained("aiXcoder/aixcoder-7b-base", quantization_config=bnb_config, device_map=device, attn_implementation='flash_attention_2')
+
+text = input_wrapper(
+    code_string="# 快速排序算法",
+    later_code="\n",
+    path="test.py"
+)
+
+if len(text) == 0:
+    sys.exit()
+
+inputs = tokenizer(text, return_tensors="pt", return_token_type_ids=False)
+inputs = inputs.to(device)
+
+outputs = model.generate(**inputs, max_new_tokens=256)
+print(f"Model memory footprint: {model.get_memory_footprint() / 2**20:.2f} MB")
+print(f"Torch max memory allocated: {torch.cuda.max_memory_allocated() / 2**20:.2f} MB")
+
+"""
+load_in_4bit=True:
+- Model memory footprint: 5656.52 MB
+- Torch max memory allocated: 6448.89 MB
+
+load_in_8bit=True:
+- Model memory footprint: 9008.52 MB
+- Torch max memory allocated: 10061.51 MB
+"""
+```
 
 ## Data for aiXcoder 7B
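For reference, here is a minimal sketch of the 4-bit variant hinted at by the `# to use 4bit` comment above. The `nf4` quant type, `bfloat16` compute dtype, and double quantization are illustrative choices, not settings documented for aiXcoder 7B; the memory numbers quoted above correspond to plain `load_in_4bit=True` as shown in the README:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Illustrative 4-bit configuration (assumed settings, not from the README):
# NF4 weights, bfloat16 compute, and nested quantization to save a bit more memory.
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained("aiXcoder/aixcoder-7b-base")
model = AutoModelForCausalLM.from_pretrained(
    "aiXcoder/aixcoder-7b-base",
    quantization_config=bnb_config_4bit,
    device_map="cuda",
)
print(f"Model memory footprint: {model.get_memory_footprint() / 2**20:.2f} MB")
```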

README_CN.md

Lines changed: 48 additions & 0 deletions
@@ -13,6 +13,7 @@
 - [Environment Requirements](#运行环境)
 - [Model Weights](#模型权重)
 - [Inference Example](#推理示例)
+- [Quantized inference with bitsandbytes](#bitsandbytes-量化执行)
 3. [aiXcoder 7B Training Data](#aixcoder-7b-训练数据)
 4. [Training](#训练)
 - [Training Hyperparameters](#训练超参数)
@@ -240,6 +241,53 @@ def quick_sort(arr):
     quick_sort(arr[left + 1:])
     return arr</s>
 """
+```
+
+### Quantized inference with bitsandbytes
+
+We can also install bitsandbytes and accelerate via `pip install bitsandbytes accelerate`, then simply add a quantization config to run int8 or int4 inference (to further reduce the temporary GPU memory allocated at runtime, installing FlashAttention is recommended):
+
+```python
+import sys
+import torch
+from hf_mini.utils import input_wrapper
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+# to use 4bit use `load_in_4bit=True` instead
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+
+device = "cuda" # the device to load the model onto
+
+tokenizer = AutoTokenizer.from_pretrained("aiXcoder/aixcoder-7b-base")
+model = AutoModelForCausalLM.from_pretrained("aiXcoder/aixcoder-7b-base", quantization_config=bnb_config, device_map=device, attn_implementation='flash_attention_2')
+
+text = input_wrapper(
+    code_string="# 快速排序算法",
+    later_code="\n",
+    path="test.py"
+)
+
+if len(text) == 0:
+    sys.exit()
+
+inputs = tokenizer(text, return_tensors="pt", return_token_type_ids=False)
+inputs = inputs.to(device)
+
+outputs = model.generate(**inputs, max_new_tokens=256)
+print(f"Model memory footprint: {model.get_memory_footprint() / 2**20:.2f} MB")
+print(f"Torch max memory allocated: {torch.cuda.max_memory_allocated() / 2**20:.2f} MB")
+
+"""
+load_in_4bit=True:
+- Model memory footprint: 5656.52 MB
+- Torch max memory allocated: 6448.89 MB
+
+load_in_8bit=True:
+- Model memory footprint: 9008.52 MB
+- Torch max memory allocated: 10061.51 MB
+"""
 
 ```
 
hf_mini/utils.py

Lines changed: 3 additions & 1 deletion
@@ -1090,17 +1090,19 @@
 
 
 import re
+import time
 from hf_mini.filter import SensitiveInforRM
 is_security = SensitiveInforRM()
 
 def input_wrapper(code_string, later_code: str = "", path: str = "") -> str:
 
+    start = time.time()
     _sequerity = True
     for i in [code_string, later_code, path]:
        if not is_security.is_security(i):
            _sequerity = False
            break
-
+    print(f"Done inputs checking with {(time.time()-start) * 1000:.2f}ms", flush=True)
    if not _sequerity:
        return ""
 
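With this change, `input_wrapper` now prints how long the sensitive-information check took before returning the wrapped prompt (or an empty string if the check fails). A minimal usage sketch, assuming `hf_mini` is importable from the repository root; the timing value in the comment is illustrative:

```python
from hf_mini.utils import input_wrapper

# Emits a line like "Done inputs checking with 0.35ms" on stdout (value illustrative),
# then returns the wrapped prompt, or "" if the sensitive-information check fails.
text = input_wrapper(
    code_string="# 快速排序算法",
    later_code="\n",
    path="test.py"
)
if not text:
    print("Input rejected by the sensitive-information filter")
```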