# Convert the traced CLIP text encoder to a Core ML program.
# NOTE: with max_seq_length = 77 (the original model's value) validation
# fails — see details at the end of the notebook; 76 works fine with the app.
max_seq_length = 76

# Single int32 token sequence in, two float32 embedding tensors out.
prompt_input = ct.TensorType(name="prompt", shape=[1, max_seq_length], dtype=np.int32)
embedding_outputs = [
    ct.TensorType(name="embOutput", dtype=np.float32),
    ct.TensorType(name="embOutput2", dtype=np.float32),
]

text_encoder_model = ct.convert(
    traced_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS16,
    inputs=[prompt_input],
    outputs=embedding_outputs,
)
text_encoder_model.save("TextEncoder_float32_test.mlpackage")
# Reload the saved package so predictions run through the Core ML runtime
# rather than the in-memory conversion result.
model = ct.models.MLModel("TextEncoder_float32_test.mlpackage")
# Tokenize the test prompt with the CLIP tokenizer, then trim the token
# sequence to the length the converted model expects.
text = clip.tokenize("a photo of a cat")[:, :max_seq_length]

# Alternative: use CLIPTokenizerFast instead of clip.tokenize
# text = tokenizer("a photo of a cat", return_tensors="pt", padding="max_length", max_length=max_seq_length)
# text = text.data['input_ids'].to(torch.int32)
# Feed the same tokens to both encoders so their first ten embedding values
# can be compared side by side (CoreML vs. the original traced PyTorch model).
predictions = model.predict({"prompt": text})
out = traced_model(text)

print("PyTorch TextEncoder ckpt out for \"a photo of a cat\":\n>>>", out[0][0, :10])
print("\nCoreML TextEncoder ckpt out for \"a photo of a cat\":\n>>>", predictions['embOutput'][0, :10])
3. 图片搜索 APP
将 CLIP 模型分别导出为 Image Encoder 和 Text Encoder。在第一次启动 APP 后,加载 Image Encoder 对相册图片进行向量计算,并将结果存储到数据库或本地文件缓存。用户输入文本后,通过 Text Encoder 计算文本向量,遍历图片向量数据库,计算向量间的余弦相似度并取 TopK(即从 n 个数中找出最大的 k 个),返回相应的图片数组。