Add schema generator (#77)

tylerhutcherson · web-flow · commit 2f23e1070773 · 2023-11-16T16:13:20.000-05:00
The schema generator takes a dictionary of metadata key/value pairs, infers a
field type for each value, and converts the result into a dictionary of redisvl field schemas.
diff --git a/redisvl/schema.py b/redisvl/schema.py
@@ -158,3 +158,96 @@ def read_schema(file_path: str):
         schema = yaml.safe_load(f)
 
     return SchemaModel(**schema)
+
+
+class MetadataSchemaGenerator:
+    """
+    A class to generate a schema for metadata, categorizing fields into text, numeric, and tag types.
+    """
+
+    def _test_numeric(self, value) -> bool:
+        """
+        Test if the given value can be represented as a numeric value.
+
+        Args:
+            value: The value to test.
+
+        Returns:
+            bool: True if float(value) succeeds (this includes bools and strings such as "nan" or "inf"), False otherwise.
+        """
+        try:
+            float(value)
+            return True
+        except (ValueError, TypeError):
+            return False
+
+    def _infer_type(self, value) -> Optional[str]:
+        """
+        Infer the type of the given value.
+
+        Args:
+            value: The value to infer the type of.
+
+        Returns:
+            Optional[str]: "numeric", "tag", or "text"; "unknown" if the type is unrecognized; None if the value is None or an empty string.
+        """
+        if value is None or value == "":
+            return None
+        elif self._test_numeric(value):
+            return "numeric"
+        elif isinstance(value, (list, set, tuple)) and all(
+            isinstance(v, str) for v in value
+        ):
+            return "tag"
+        elif isinstance(value, str):
+            return "text"
+        else:
+            return "unknown"
+
+    def generate(
+        self, metadata: Dict[str, Any], strict: Optional[bool] = False
+    ) -> Dict[str, List[Dict[str, Any]]]:
+        """
+        Generate a schema from the provided metadata.
+
+        This method categorizes each metadata field into text, numeric, or tag types based on the field values.
+        It also allows forcing strict type determination by raising an exception if a type cannot be inferred.
+
+        Args:
+            metadata: The metadata dictionary to generate the schema from.
+            strict: If True, raise a ValueError for fields whose type cannot be determined; otherwise print a warning and skip the field.
+
+        Returns:
+            Dict[str, List[Dict[str, Any]]]: A dictionary with keys 'text', 'numeric', and 'tag', each mapping to a list of field schemas.
+
+        Raises:
+            ValueError: If strict is True and a field's type cannot be determined.
+        """
+        result: Dict[str, List[Dict[str, Any]]] = {"text": [], "numeric": [], "tag": []}
+
+        for key, value in metadata.items():
+            field_type = self._infer_type(value)
+
+            if field_type in ["unknown", None]:
+                if strict:
+                    raise ValueError(
+                        f"Unable to determine field type for key '{key}' with value '{value}'"
+                    )
+                print(
+                    f"Warning: Unable to determine field type for key '{key}' with value '{value}'"
+                )
+                continue
+
+            # Extract the field class with defaults
+            field_class = {
+                "text": TextFieldSchema,
+                "tag": TagFieldSchema,
+                "numeric": NumericFieldSchema,
+            }.get(
+                field_type  # type: ignore
+            )
+
+            if field_class:
+                result[field_type].append(field_class(name=key).dict(exclude_none=True))  # type: ignore
+
+        return result
diff --git a/tests/integration/test_vectorizers.py b/tests/integration/test_vectorizers.py
@@ -12,7 +12,8 @@
 @pytest.fixture
 def skip_vectorizer() -> bool:
     # os.getenv returns a string
-    return os.getenv("SKIP_VECTORIZERS", 'False').lower() == 'true'
+    return os.getenv("SKIP_VECTORIZERS", "False").lower() == "true"
+
 
 skip_vectorizer_test = lambda: pytest.config.getfixturevalue("skip_vectorizer")
 
diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py
@@ -12,6 +12,7 @@
     FlatVectorField,
     GeoFieldSchema,
     HNSWVectorField,
+    MetadataSchemaGenerator,
     NumericFieldSchema,
     SchemaModel,
     TagFieldSchema,
@@ -167,3 +168,71 @@ def test_schema_model_validation_failures():
 def test_read_schema_file_not_found():
     with pytest.raises(FileNotFoundError):
         read_schema("non_existent_file.yaml")
+
+
+# Fixture for the generator instance
+@pytest.fixture
+def schema_generator():
+    return MetadataSchemaGenerator()
+
+
+# Test cases for _test_numeric
+@pytest.mark.parametrize(
+    "value, expected",
+    [
+        (123, True),
+        ("123", True),
+        ("123.45", True),
+        ("abc", False),
+        (None, False),
+        ("", False),
+    ],
+)
+def test_test_numeric(schema_generator, value, expected):
+    assert schema_generator._test_numeric(value) == expected
+
+
+# Test cases for _infer_type
+@pytest.mark.parametrize(
+    "value, expected",
+    [
+        (123, "numeric"),
+        ("123", "numeric"),
+        (["tag1", "tag2"], "tag"),
+        ("text", "text"),
+        (None, None),
+        ("", None),
+        ({"key": "value"}, "unknown"),
+    ],
+)
+def test_infer_type(schema_generator, value, expected):
+    assert schema_generator._infer_type(value) == expected
+
+
+# Test cases for generate
+@pytest.mark.parametrize(
+    "metadata, strict, expected",
+    [
+        (
+            {"name": "John", "age": 30, "tags": ["friend", "colleague"]},
+            False,
+            {
+                "text": [TextFieldSchema(name="name").dict(exclude_none=True)],
+                "numeric": [NumericFieldSchema(name="age").dict(exclude_none=True)],
+                "tag": [TagFieldSchema(name="tags").dict(exclude_none=True)],
+            },
+        ),
+        (
+            {"invalid": {"nested": "dict"}},
+            False,
+            {"text": [], "numeric": [], "tag": []},
+        ),
+        ({"invalid": {"nested": "dict"}}, True, pytest.raises(ValueError)),
+    ],
+)
+def test_generate(schema_generator, metadata, strict, expected):
+    if not isinstance(expected, dict):
+        with expected:
+            schema_generator.generate(metadata, strict)
+    else:
+        assert schema_generator.generate(metadata, strict) == expected