Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 61 additions & 11 deletions pythonvectordbceph.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
import torch
import langchain
from langchain_text_splitters import CharacterTextSplitter

# this is needed only when the second image embedding function is used
# from transformers import AutoFeatureExtractor, AutoModelForImageClassification
Expand Down Expand Up @@ -71,6 +73,10 @@ def __call__(self, imagepath):

object_type = os.getenv("OBJECT_TYPE")

# CHUNK_SIZE is optional. os.getenv returns None when the variable is unset and
# int(None) raises TypeError, so the default must be applied BEFORE converting.
# A value of 1 means the object is embedded as a whole (chunking is only
# performed when chunk_size > 1). A non-numeric value still raises ValueError
# at startup so a misconfiguration is not silently ignored.
_raw_chunk_size = os.getenv("CHUNK_SIZE")
if _raw_chunk_size is None or not _raw_chunk_size.strip():
    chunk_size = 1
else:
    chunk_size = int(_raw_chunk_size)

app = Flask(__name__)

Expand All @@ -87,16 +93,24 @@ def pythonvectordbappceph():
event_type = event_data['Records'][0]['eventName']
app.logger.debug(object_key)
tags = event_data['Records'][0]['s3']['object']['tags']
app.logger.debug("tags : " + str(tags))
if len(tags) == 0:
tags = {}
#app.logger.debug("tags : " + str(tags))
# Create collection which includes the id, object url, and embedded vector
if not client.has_collection(collection_name=collection_name):
fields = [
FieldSchema(name='url', dtype=DataType.VARCHAR, max_length=2048, is_primary=True), # VARCHARS need a maximum length, so for this example they are set to 200 characters
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
FieldSchema(name='url', dtype=DataType.VARCHAR, max_length=2048), # VARCHARS need a maximum length, so for this example they are set to 200 characters
FieldSchema(name='embedded_vector', dtype=DataType.FLOAT_VECTOR, dim=int(os.getenv("VECTOR_DIMENSION"))),
FieldSchema(name='start_offset', dtype=DataType.INT64, nullable=True),
FieldSchema(name='end_offset', dtype=DataType.INT64, nullable=True),
FieldSchema(name='tags', dtype=DataType.JSON, nullable=True)
]
#app.logger.debug(fields)
schema = CollectionSchema(fields=fields, enable_dynamic_field=True)
#app.logger.debug(schema)
client.create_collection(collection_name=collection_name, schema=schema)

index_params = client.prepare_index_params()
index_params.add_index(field_name="embedded_vector", metric_type="L2", index_type="IVF_FLAT", params={"nlist": 16384})
client.create_index(collection_name=collection_name, index_params=index_params)
Expand All @@ -116,11 +130,30 @@ def pythonvectordbappceph():
case "TEXT":
object_content = object_data["Body"].read().decode("utf-8")
objectlist = []
objectlist.append(object_content)
# default embedding function provided by milvus, it has some size limitation for the object
# embedding_fn = milvus_model.DefaultEmbeddingFunction() #dimension 768
if chunk_size < 1:
app.logger.error("chunk size cannot be less than zero")
return
if chunk_size > 1:
object_size = object_data["ContentLength"]
if object_size == 0 :
app.logger.debug("object size zero cannot be chunked")
return
text_splitter = CharacterTextSplitter(
separator=".",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you split by . or by size?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it possible to demo chunking done by content (by the language model itself)?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you split by . or by size?

It first checks for "."; if none is found, chunking happens based on size.

chunk_size = chunk_size,
chunk_overlap=0,
length_function=len,
is_separator_regex=False,
)
objectlist = text_splitter.split_text(object_content)
app.logger.debug("chunk size " + str(chunk_size) + " no of chunks " + str(len(objectlist)))
else :
objectlist.append(object_content)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do you append the entire object content to the object list?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If chunking is disabled, the entire content is added together.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so, chunk_size=1 is the indication that chunking is disabled?
why not "0"?
also, what would be the value if the env var is not set?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will set it to one if it is not defined; that is missing in this PR.

# default embedding function provided by milvus, it has some size limitation for the object
# embedding_fn = milvus_model.DefaultEmbeddingFunction() #dimension 768
embedding_fn = milvus_model.dense.SentenceTransformerEmbeddingFunction(model_name='all-MiniLM-L6-v2',device='cpu') # dimension 384
vectors = embedding_fn.encode_documents(objectlist)
app.logger.debug("vector length "+str(len(vectors)))
vector = vectors[0]

case "IMAGE":
Expand All @@ -144,14 +177,31 @@ def pythonvectordbappceph():
case _:
app.logger.error("Unknown object format")

app.logger.debug(vector)

if len(tags) > 0:
data = [ {"embedded_vector": vector, "url": object_url, "tags": tags} ]
# delete already existing entries, otherwise duplicate entries are possible
res = client.delete(collection_name=collection_name,
filter="url in "+ object_url)
#app.logger.debug(res)

#app.logger.debug(vector)
data = []
# null value is not working as expected. The attribute is not set properly
if chunk_size > 1:
start_offset = 0
for i in range(len(objectlist)):
end_offset = start_offset + len(objectlist[i])
if len(tags) > 0:
data.append({"embedded_vector": vectors[i], "url": object_url, "start_offset": start_offset, "end_offset": end_offset, "tags" : tags})
else:
data.append({"embedded_vector": vectors[i], "url": object_url, "start_offset": start_offset, "end_offset": end_offset})
start_offset = end_offset + 1
else:
data = [ {"embedded_vector": vector, "url": object_url} ]
if len(tags) > 0:
data.append({"embedded_vector": vector, "url": object_url, "tags": tags})
else:
data.append({"embedded_vector": vector, "url": object_url})

res = client.upsert(collection_name=collection_name, data=data)
#app.logger.debug(data)
res = client.insert(collection_name=collection_name, data=data)
app.logger.debug(res)
return ''

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ torch
timm
scikit-learn
sentence-transformers
langchain
1 change: 1 addition & 0 deletions sample-deployment-text.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ data:
MILVUS_ENDPOINT : "http://my-release-milvus.default.svc:19530"
OBJECT_TYPE : "TEXT"
VECTOR_DIMENSION: "384"
# CHUNK_SIZE : "500"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this commented out?