CLIP

Introduction

CLIP (Contrastive Language-Image Pre-training) encodes text and images into a shared embedding space, so a caption and the image it describes end up with similar vectors. This note walks through vectorizing a sentence and an image with the open_clip implementation, running everything in a container against the LAION ViT-B-16 checkpoint.

Prepare the Pre-trained Model

curl -LO https://huggingface.co/laion/CLIP-ViT-B-16-laion2B-s34B-b88K/resolve/main/open_clip_pytorch_model.bin
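
If curl is not convenient, the same checkpoint can be fetched with the Hugging Face client instead. A minimal sketch, assuming the huggingface_hub package is installed (it is not part of the base image below):

    # download-model.py -- optional alternative to curl
    from huggingface_hub import hf_hub_download

    path = hf_hub_download(
        repo_id="laion/CLIP-ViT-B-16-laion2B-s34B-b88K",
        filename="open_clip_pytorch_model.bin",
        local_dir=".",  # save next to the scripts used later in this guide
    )
    print(path)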

Prepare the Base Image

  • the open-clip-torch dependency is too large to install on every container run, so we prepare a base image with it pre-installed
  • prepare base.dockerfile
    • FROM docker.io/library/python:3.12.1-bullseye
      
      # install open-clip-torch from the Aliyun PyPI mirror (any PyPI index works)
      RUN pip install -i https://mirrors.aliyun.com/pypi/simple/ open-clip-torch==2.24.0
      
      
  • build the base image
    • podman build -t open-clip-base:latest -f base.dockerfile .
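
To confirm the image built correctly before writing any scripts, a quick sanity check can be run inside it; mount the file into the container the same way as the scripts below and execute it with python3. A sketch (list_pretrained() returns the (architecture, pretrained-tag) pairs the library knows about):

    # verify-base.py -- smoke test for the open-clip-base image
    import open_clip

    print(open_clip.__version__)             # expected: 2.24.0
    print(open_clip.list_pretrained()[:3])   # a few known (model, tag) pairs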
      

Text Vectorization

  1. prepare text-vectorization.py
    • import os
      import torch
      import open_clip
      
      # configuration comes from the environment, with defaults matching this guide
      model_path = os.getenv("MODEL_PATH", default="/app/model/open_clip_pytorch_model.bin")
      model_name = os.getenv("MODEL_NAME", default="ViT-B-16")
      sentence = os.getenv("SENTENCE", default="hello world")
      device = "cuda" if torch.cuda.is_available() else "cpu"
      
      # load the downloaded checkpoint into the requested architecture
      model, _, preprocess = open_clip.create_model_and_transforms(
          model_name=model_name, pretrained=model_path, device=device
      )
      tokenizer = open_clip.get_tokenizer(model_name)
      
      # tokenize, then move the token tensor to the same device as the model
      text = tokenizer([sentence]).to(device)
      with torch.no_grad(), torch.cuda.amp.autocast():
          text_features = model.encode_text(text)
          # L2-normalize so dot products become cosine similarities
          text_features /= text_features.norm(dim=-1, keepdim=True)
          print(text_features)
      
      
  2. run it in a container
    • podman run --rm \
          -v $(pwd)/open_clip_pytorch_model.bin:/app/model/open_clip_pytorch_model.bin \
          -v $(pwd)/text-vectorization.py:/app/text-vectorization.py \
          -e MODEL_PATH=/app/model/open_clip_pytorch_model.bin \
          -e MODEL_NAME=ViT-B-16 \
          -e SENTENCE="On a freezing New Year's Eve, a poor young girl, shivering, bareheaded and barefoot, unsuccessfully tries to sell matches in the street." \
          -it localhost/open-clip-base:latest \
              python3 /app/text-vectorization.py
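
The script prints a tensor of shape (1, 512): ViT-B-16 projects text into a 512-dimensional embedding space. Because the features are L2-normalized, the dot product of two embeddings is their cosine similarity. A minimal sketch of comparing two sentences this way (the file name and sentences are made up for illustration):

    # text-similarity.py -- compare two sentences via cosine similarity
    import torch
    import open_clip

    model, _, _ = open_clip.create_model_and_transforms(
        model_name="ViT-B-16", pretrained="/app/model/open_clip_pytorch_model.bin"
    )
    tokenizer = open_clip.get_tokenizer("ViT-B-16")

    tokens = tokenizer(["a girl selling matches", "a stock market chart"])
    with torch.no_grad():
        feats = model.encode_text(tokens)
        feats /= feats.norm(dim=-1, keepdim=True)

    # cosine similarity between the two sentences, a scalar in [-1, 1]
    print((feats[0] @ feats[1]).item())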
      

Image Vectorization

  1. prepare image-vectorization.py
    • import os
      import torch
      import open_clip
      
      from PIL import Image
      
      # configuration comes from the environment, with defaults matching this guide
      model_path = os.getenv("MODEL_PATH", default="/app/model/open_clip_pytorch_model.bin")
      model_name = os.getenv("MODEL_NAME", default="ViT-B-16")
      image_path = os.getenv("IMAGE_PATH", default="/app/image.jpg")
      device = "cuda" if torch.cuda.is_available() else "cpu"
      
      model, _, preprocess = open_clip.create_model_and_transforms(
          model_name=model_name, pretrained=model_path, device=device
      )
      
      # preprocess resizes, crops, and normalizes; unsqueeze adds the batch dimension
      image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
      with torch.no_grad(), torch.cuda.amp.autocast():
          image_features = model.encode_image(image)
          # L2-normalize so dot products become cosine similarities
          image_features /= image_features.norm(dim=-1, keepdim=True)
          print(image_features)
      
      
  2. prepare image.jpg
    • put any test image in the working directory as image.jpg
  3. run it in a container
    • podman run --rm \
          -v $(pwd)/open_clip_pytorch_model.bin:/app/model/open_clip_pytorch_model.bin \
          -v $(pwd)/image-vectorization.py:/app/image-vectorization.py \
          -v $(pwd)/image.jpg:/app/image.jpg \
          -e MODEL_PATH=/app/model/open_clip_pytorch_model.bin \
          -e MODEL_NAME=ViT-B-16 \
          -e IMAGE_PATH=/app/image.jpg \
          -it localhost/open-clip-base:latest \
              python3 /app/image-vectorization.py
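
Text and image embeddings live in the same 512-dimensional space, so the two kinds of vectors can be compared directly. A sketch of zero-shot matching in the style of the open_clip README (the candidate captions and file name are made up for illustration):

    # zero-shot-demo.py -- score an image against candidate captions
    import torch
    import open_clip
    from PIL import Image

    model, _, preprocess = open_clip.create_model_and_transforms(
        model_name="ViT-B-16", pretrained="/app/model/open_clip_pytorch_model.bin"
    )
    tokenizer = open_clip.get_tokenizer("ViT-B-16")

    captions = ["a diagram", "a dog", "a girl selling matches"]
    image = preprocess(Image.open("/app/image.jpg")).unsqueeze(0)
    tokens = tokenizer(captions)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(tokens)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        # scaled cosine similarities -> probabilities over the captions
        probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    for caption, prob in zip(captions, probs[0].tolist()):
        print(f"{caption}: {prob:.3f}")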
      

References

  • https://github.com/openai/CLIP
  • https://github.com/mlfoundations/open_clip
  • https://huggingface.co/laion/CLIP-ViT-B-16-laion2B-s34B-b88K