Fix embedding generation for T5 models
antas-marcin committed Oct 26, 2023
1 parent 1cc402c commit fb9d535
Showing 6 changed files with 74 additions and 26 deletions.
11 changes: 9 additions & 2 deletions app.py
@@ -43,8 +43,15 @@ def startup_event():
     if transformers_direct_tokenize is not None and transformers_direct_tokenize == "true" or transformers_direct_tokenize == "1":
         direct_tokenize = True
 
-    meta_config = Meta('./models/model')
-    vec = Vectorizer('./models/model', cuda_support, cuda_core, cuda_per_process_memory_fraction,
+    def get_model_directory() -> str:
+        if os.path.exists("./models/model/model_name"):
+            with open("./models/model/model_name", "r") as f:
+                model_name = f.read()
+                return f"./models/model/{model_name}"
+        return "./models/model"
+
+    meta_config = Meta(get_model_directory())
+    vec = Vectorizer(get_model_directory(), cuda_support, cuda_core, cuda_per_process_memory_fraction,
                      meta_config.getModelType(), meta_config.get_architecture(), direct_tokenize)
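Note: the helper above introduces a small marker-file convention for resolving the model directory. A minimal, self-contained sketch of that convention (the model id below is a hypothetical example; only the model_name marker file and the fallback path come from the diff above):

import os

# Hypothetical setup: download.py wrote the flattened model id into the
# marker file, so the weights live in a subdirectory of ./models/model.
os.makedirs("./models/model", exist_ok=True)
with open("./models/model/model_name", "w") as f:
    f.write("sentence-transformers_sentence-t5-base")  # example id, "/" -> "_"

def get_model_directory() -> str:
    # Same logic as the helper added to app.py: prefer the subdirectory
    # named in the marker file, otherwise fall back to ./models/model.
    if os.path.exists("./models/model/model_name"):
        with open("./models/model/model_name", "r") as f:
            return f"./models/model/{f.read()}"
    return "./models/model"

print(get_model_directory())
# -> ./models/model/sentence-transformers_sentence-t5-base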
35 changes: 22 additions & 13 deletions download.py
@@ -8,8 +8,10 @@
     AutoTokenizer,
     AutoConfig,
 )
+from sentence_transformers import SentenceTransformer
 
 
+model_dir = './models/model'
 model_name = os.getenv('MODEL_NAME', None)
 force_automodel = os.getenv('FORCE_AUTOMODEL', False)
 if not model_name:
@@ -22,21 +24,28 @@
 
 print(f"Downloading model {model_name} from huggingface model hub")
 config = AutoConfig.from_pretrained(model_name)
-if config.architectures and not force_automodel:
-    print(f"Using class {config.architectures[0]} to load model weights")
-    mod = __import__('transformers', fromlist=[config.architectures[0]])
-    try:
-        klass_architecture = getattr(mod, config.architectures[0])
-        model = klass_architecture.from_pretrained(model_name)
-    except AttributeError:
-        print(f"{config.architectures[0]} not found in transformers, fallback to AutoModel")
-        model = AutoModel.from_pretrained(model_name)
+model_type = config.to_dict()['model_type']
+
+if model_type is not None and model_type == "t5":
+    SentenceTransformer(model_name, cache_folder=model_dir)
+    with open(f"{model_dir}/model_name", "w") as f:
+        f.write(model_name.replace("/", "_"))
 else:
-    model = AutoModel.from_pretrained(model_name)
+    if config.architectures and not force_automodel:
+        print(f"Using class {config.architectures[0]} to load model weights")
+        mod = __import__('transformers', fromlist=[config.architectures[0]])
+        try:
+            klass_architecture = getattr(mod, config.architectures[0])
+            model = klass_architecture.from_pretrained(model_name)
+        except AttributeError:
+            print(f"{config.architectures[0]} not found in transformers, fallback to AutoModel")
+            model = AutoModel.from_pretrained(model_name)
+    else:
+        model = AutoModel.from_pretrained(model_name)
 
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-model.save_pretrained('./models/model')
-tokenizer.save_pretrained('./models/model')
+    model.save_pretrained(model_dir)
+    tokenizer.save_pretrained(model_dir)
 
 nltk.download('punkt', download_dir='./nltk_data')
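Note: the marker file stores model_name.replace("/", "_") because sentence-transformers 2.2.x (pinned below in requirements.txt) caches a hub model under <cache_folder>/<model id with "/" replaced by "_">, which is exactly the subdirectory that get_model_directory() in app.py later appends to ./models/model. A hedged sketch of the resulting layout, assuming an example model id and network access:

import os
from sentence_transformers import SentenceTransformer

model_dir = './models/model'
model_name = 'sentence-transformers/sentence-t5-base'  # example MODEL_NAME

# Expected to download into ./models/model/sentence-transformers_sentence-t5-base
# (sentence-transformers 2.2.x derives the cache folder name from the model id).
SentenceTransformer(model_name, cache_folder=model_dir)

with open(f"{model_dir}/model_name", "w") as f:
    f.write(model_name.replace("/", "_"))

print(sorted(os.listdir(model_dir)))
# expected to include: 'model_name' and 'sentence-transformers_sentence-t5-base'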
1 change: 1 addition & 0 deletions requirements-test.txt
@@ -5,4 +5,5 @@ uvicorn==0.21.1
 nltk==3.8.1
 torch==2.0.0
 sentencepiece==0.1.97
+sentence-transformers==2.2.2
 pytest
9 changes: 5 additions & 4 deletions requirements.txt
@@ -1,6 +1,7 @@
-transformers==4.29.2
-fastapi==0.95.2
-uvicorn==0.22.0
+transformers==4.34.0
+fastapi==0.103.2
+uvicorn==0.23.2
 nltk==3.8.1
-torch==2.0.1
+torch==2.1.0
 sentencepiece==0.1.99
+sentence-transformers==2.2.2
1 change: 1 addition & 0 deletions smoke_test.py
@@ -50,6 +50,7 @@ def try_to_vectorize(url):
             # aware of 384 and 768 dim vectors, which should both fall in that
             # range
             self.assertTrue(len(resBody['vector']) > 100)
+            print(f"vector dimensions are: {len(resBody['vector'])}")
 
         try_to_vectorize(self.url + "/vectors/")
         try_to_vectorize(self.url + "/vectors")
43 changes: 36 additions & 7 deletions vectorizer.py
@@ -14,6 +14,7 @@
     DPRContextEncoder,
     DPRQuestionEncoder,
 )
+from sentence_transformers import SentenceTransformer
 
 
 # limit transformer batch size to limit parallel inference, otherwise we run
@@ -29,14 +30,47 @@ class VectorInput(BaseModel):
     text: str
     config: Optional[VectorInputConfig] = None
 
 
 class Vectorizer:
+    executor: ThreadPoolExecutor
+
+    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str, direct_tokenize: bool):
+        self.executor = ThreadPoolExecutor()
+        if model_type == 't5':
+            self.vectorizer = SentenceTransformerVectorizer(model_path, cuda_core)
+        else:
+            self.vectorizer = HuggingFaceVectorizer(model_path, cuda_support, cuda_core, cuda_per_process_memory_fraction, model_type, architecture, direct_tokenize)
+
+    async def vectorize(self, text: str, config: VectorInputConfig):
+        return await asyncio.wrap_future(self.executor.submit(self.vectorizer.vectorize, text, config))
+
+
+class SentenceTransformerVectorizer:
+    model: SentenceTransformer
+    cuda_core: str
+
+    def __init__(self, model_path: str, cuda_core: str):
+        self.cuda_core = cuda_core
+        self.model = SentenceTransformer(model_path, device=self.get_device())
+        self.model.eval() # make sure we're in inference mode, not training
+
+    def get_device(self) -> Optional[str]:
+        if self.cuda_core is not None and self.cuda_core != "":
+            return self.cuda_core
+        return None
+
+    def vectorize(self, text: str, config: VectorInputConfig):
+        embedding = self.model.encode([text], device=self.get_device(), convert_to_tensor=False, convert_to_numpy=True)
+        return embedding[0]
+
+
+class HuggingFaceVectorizer:
     model: AutoModel
     tokenizer: AutoTokenizer
     cuda: bool
     cuda_core: str
     model_type: str
     direct_tokenize: bool
-    executor: ThreadPoolExecutor
 
     def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str, direct_tokenize: bool):
         self.cuda = cuda_support
@@ -56,8 +90,6 @@ def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per
 
         self.tokenizer = self.model_delegate.create_tokenizer(model_path)
 
-        self.executor = ThreadPoolExecutor()
-
         nltk.data.path.append('./nltk_data')
 
     def tokenize(self, text:str):
@@ -73,7 +105,7 @@ def get_batch_results(self, tokens, text):
     def pool_embedding(self, batch_results, tokens, config):
         return self.model_delegate.pool_embedding(batch_results, tokens, config)
 
-    def _vectorize(self, text: str, config: VectorInputConfig):
+    def vectorize(self, text: str, config: VectorInputConfig):
         with torch.no_grad():
             if self.direct_tokenize:
                 # create embeddings without tokenizing text
@@ -100,9 +132,6 @@ def _vectorize(self, text: str, config: VectorInputConfig):
                 batch_sum_vectors += self.pool_embedding(batch_results, tokens, config)
             return batch_sum_vectors.detach() / num_sentences
 
-    async def vectorize(self, text: str, config: VectorInputConfig):
-        return await asyncio.wrap_future(self.executor.submit(self._vectorize, text, config))
-
 
 class HFModel:
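Note: with this change, Vectorizer becomes a thin async dispatcher over the two backends. A minimal usage sketch, assuming the model layout produced by download.py (the model path is an example; the positional arguments mirror Vectorizer.__init__ in this diff):

import asyncio

from vectorizer import Vectorizer

async def main():
    # model_type 't5' routes to SentenceTransformerVectorizer; any other
    # value keeps the existing HuggingFaceVectorizer code path.
    vec = Vectorizer(
        "./models/model/sentence-transformers_sentence-t5-base",  # example path
        False,  # cuda_support
        "",     # cuda_core
        1.0,    # cuda_per_process_memory_fraction
        "t5",   # model_type
        None,   # architecture (unused on the T5 path)
        False,  # direct_tokenize
    )
    vector = await vec.vectorize("The London Eye is a ferris wheel.", None)
    print(len(vector))  # e.g. 768 for a sentence-t5-base checkpoint

asyncio.run(main())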
