Method 1
Load the pipeline
Load a FeatureExtractionPipeline from the transformers library. This step is not needed if you already know all of the input shapes and configuration of the model you want to use. The helper functions in convert_graph_to_onnx greatly simplify handling custom models; the variables they produce are used in the torch export call below.
import torch
import onnx
from transformers import FeatureExtractionPipeline, AutoModel, AutoTokenizer, convert_graph_to_onnx

model_access = "my_model_dir"
model_pipeline = FeatureExtractionPipeline(
    model=AutoModel.from_pretrained(model_access),
    tokenizer=AutoTokenizer.from_pretrained(model_access, use_fast=True),
    framework="pt",
    device=-1)
config = model_pipeline.model.config
tokenizer = model_pipeline.tokenizer

with torch.no_grad():
    input_names, output_names, dynamic_axes, tokens = convert_graph_to_onnx.infer_shapes(model_pipeline, "pt")
    ordered_input_names, model_args = convert_graph_to_onnx.ensure_valid_input(
        model_pipeline.model, tokens, input_names)

# If you want to export more outputs, modify dynamic_axes and output_names accordingly.
del dynamic_axes["output_0"]  # Delete unused output
del dynamic_axes["output_1"]  # Delete unused output
output_names = ["output"]
dynamic_axes["output"] = {0: 'batch'}
# Export the model to ONNX
model = torch.load("best_model.pth")  # the fine-tuned model to export
output = "best_model.onnx"
torch.onnx.export(
    model,
    model_args,
    f=output,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes,
    do_constant_folding=True,
    use_external_data_format=False,
    enable_onnx_checker=True,
    opset_version=11)

# Check the ONNX model
onnx_model = onnx.load(output)
onnx.checker.check_model(onnx_model)
print('The model is checked!')
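Optionally, beyond the structural check, the exported graph can be compared numerically against the PyTorch model on the same traced inputs. The sketch below is not part of the original script; it assumes the variables from the export step (output, tokens, input_names, model_args, model) are still in scope and that the model is on CPU, and the tolerances are arbitrary.

import numpy as np
import onnxruntime as rt

# Run the ONNX model on the same tokenized inputs used for tracing
sess = rt.InferenceSession(output)
onnx_inputs = {name: tokens[name].numpy() for name in input_names}
onnx_out = sess.run(None, onnx_inputs)[0]

# Run the PyTorch model and compare its first output tensor
model.eval()
with torch.no_grad():
    torch_out = model(*model_args)[0].numpy()
np.testing.assert_allclose(torch_out, onnx_out, rtol=1e-3, atol=1e-4)  # tolerances are an assumption
print('PyTorch and ONNX Runtime outputs match')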
Load the ONNX model and run inference
import numpy as np
from transformers import AutoTokenizer
import onnxruntime as rt
import time
onnx_model_path = "best_model.onnx"
model_path = "best-checkpoint" # my_model_dir
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
span = "输入" # 输入的文本
opt = rt.SessionOptions()
sess = rt.InferenceSession(onnx_model_path) # Loads the model
t0 = time.perf_counter()
model_input = tokenizer.encode_plus(span)
model_input = {name: np.atleast_2d(value) for name, value in model_input.items()}
onnx_result = sess.run(None, model_input)
onnx_result = onnx_result[0]
onnx_result = np.argmax(onnx_result, axis=-1)
print(time.perf_counter() - t0)
Method 2
pt2onnx
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import onnx

def pt2onnx_bert():
    pretrained_model = '../model/bert-base-chinese'
    onnx_path = 'api/onnx/bert-base-chinese-cls.onnx'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained(pretrained_model)
    model = BertForSequenceClassification.from_pretrained(pretrained_model)
    model.eval()
    model.to(device)
    input_names = ['input_ids', 'attention_mask', 'token_type_ids']
    output_names = ['output']
    # Build a dummy input for tracing
    inputs = '输入'
    encode_dict = tokenizer.encode_plus(text=inputs,
                                        max_length=512,
                                        pad_to_max_length=True,
                                        return_tensors='pt',
                                        return_token_type_ids=True,
                                        return_attention_mask=True)
    # Move the dummy tensors to the same device as the model
    input_ids = encode_dict['input_ids'].to(device)
    attention_mask = encode_dict['attention_mask'].to(device)
    token_type_ids = encode_dict['token_type_ids'].to(device)
    dummy_input = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids
    }
    with torch.no_grad():
        torch.onnx.export(model=model,
                          args=tuple(dummy_input.values()),
                          f=onnx_path,
                          opset_version=11,
                          input_names=input_names,
                          output_names=output_names,
                          dynamic_axes={'input_ids': {0: 'batch_size'},
                                        'attention_mask': {0: 'batch_size'},
                                        'token_type_ids': {0: 'batch_size'},
                                        'output': {0: 'batch_size'}}
                          )
    # Validate the exported graph (check_model raises if the model is invalid)
    onnx.checker.check_model(onnx_path)
    print('The ONNX model is checked!')
Inference
import numpy as np
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers

def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"
    options = SessionOptions()
    # Control the number of threads (0 lets ONNX Runtime choose the default)
    options.intra_op_num_threads = 0
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    session = InferenceSession(model_path, options, providers=[provider])
    return session

def onnx_inference(x):
    # Use 'CPUExecutionProvider' instead if no GPU is available
    ort_session = create_model_for_provider(onnx_model_path, 'CUDAExecutionProvider')
    encode_dict = tokenizer.batch_encode_plus(batch_text_or_text_pairs=x,
                                              max_length=MAX_SEQ_LEN,
                                              pad_to_max_length=True,
                                              return_tensors='pt',
                                              return_token_type_ids=False,
                                              return_attention_mask=True)
    inputs = {k: v.numpy() for k, v in encode_dict.items()}
    outputs = ort_session.run(None, inputs)
    # The post-processing below depends on the model; this assumes sequence classification
    outputs = outputs[0]
    outputs = np.argmax(outputs, axis=-1)
    result = []
    for out in outputs:
        result.append(id2label[out])
    return result
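The helper above relies on several names that are defined elsewhere in the project: onnx_model_path, tokenizer, MAX_SEQ_LEN and id2label. A minimal sketch of that assumed setup is shown below; the label map is hypothetical and the paths are simply reused from the export step, not values confirmed for this inference script.

from transformers import BertTokenizer

onnx_model_path = 'api/onnx/bert-base-chinese-cls.onnx'           # path produced by pt2onnx_bert()
tokenizer = BertTokenizer.from_pretrained('../model/bert-base-chinese')
MAX_SEQ_LEN = 512
id2label = {0: 'negative', 1: 'positive'}                         # hypothetical label map

print(onnx_inference(['输入']))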
Compiling Python with Cython
from distutils.core import setup
from Cython.Build import cythonize
from distutils.extension import Extension
extensions = [
    Extension(name='name1',
              sources=['source files'],
              include_dirs=['header directories'],
              libraries=['library names'],
              library_dirs=['library directories']),
    Extension(name='name2',
              sources=['source files'],
              include_dirs=['header directories'],
              libraries=['library names'],
              library_dirs=['library directories'])
]
# Place setup.py in the same directory as the source files
# With the configuration above, two shared objects are generated: name1.so and name2.so
# Command: python setup.py build_ext (generates the C files and the .so files at the same time)
setup(
    name='name',
    # language_level=3 targets Python 3
    ext_modules=cythonize(extensions, language_level=3)
)
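For a pure-Python module the include/library fields can usually be omitted. Below is a minimal filled-in sketch; the module name infer and source file infer.py are hypothetical, not taken from the project.

from distutils.core import setup
from Cython.Build import cythonize
from distutils.extension import Extension

# Hypothetical example: compile a single pure-Python file infer.py into infer.so
extensions = [Extension(name='infer', sources=['infer.py'])]

setup(
    name='infer',
    ext_modules=cythonize(extensions, language_level=3)
)

# Build in place, then import the compiled module as usual:
#   python setup.py build_ext --inplace
#   python -c "import infer"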