From 91afb17b87d2131059a099ecfdeacf45f673a42a Mon Sep 17 00:00:00 2001 From: chandlerNick Date: Fri, 13 Mar 2026 14:05:18 +0100 Subject: [PATCH 1/5] Add create errors with config in api/high_level.py --- tab_err/api/high_level.py | 45 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/tab_err/api/high_level.py b/tab_err/api/high_level.py index 7570981..6724f27 100644 --- a/tab_err/api/high_level.py +++ b/tab_err/api/high_level.py @@ -197,6 +197,49 @@ def create_errors( # noqa: PLR0913 error_mechanisms_to_include: list[ErrorMechanism] | None = None, error_mechanisms_to_exclude: list[ErrorMechanism] | None = None, seed: int | None = None, +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Creates errors in a given DataFrame, at a rate of *approximately* max_error_rate. Allows for terse usage of create errors with config when information about errors is unnecessary. + + Args: + data (pd.DataFrame): The pandas DataFrame to create errors in. + error_rate (float): The maximum error rate to be introduced to each column in the DataFrame. + n_error_models_per_column (int, optional): The number of valid error models to apply to each column. Defaults to 1. + error_types_to_include (list[ErrorType] | None, optional): A list of the error types to be included when building error models. Defaults to None. + error_types_to_exclude (list[ErrorType] | None, optional): A list of the error types to be excluded when building error models. Defaults to None. + When both error_types_to_include and error_types_to_exclude are none, the maximum number of default error types will be used. + At least one must be None or an error will occur. + error_mechanisms_to_include (list[ErrorMechanism] | None = None): A list of the error mechanisms to be included when building error models. + Defaults to None. + error_mechanisms_to_exclude (list[ErrorMechanism] | None = None): A list of the error mechanisms to be excluded when building error models. 
+ Defaults to None. + seed (int | None, optional): Random seed. Defaults to None. + + Returns: + tuple[pd.DataFrame, pd.DataFrame]: + - The first element is a copy of 'data' with errors. + - The second element is the associated error mask. + """ + return create_errors_with_config( + data=data, + error_rate=error_rate, + n_error_models_per_column=n_error_models_per_column, + error_types_to_include=error_types_to_include, + error_types_to_exclude=error_types_to_exclude, + error_mechanisms_to_include=error_mechanisms_to_include, + error_mechanisms_to_exclude=error_mechanisms_to_exclude, + seed=seed, + )[:2] # Drop the config from the return for this function. + + +def create_errors_with_config( # noqa: PLR0913 + data: pd.DataFrame, + error_rate: float, + n_error_models_per_column: int = 1, + error_types_to_include: list[ErrorType] | None = None, + error_types_to_exclude: list[ErrorType] | None = None, + error_mechanisms_to_include: list[ErrorMechanism] | None = None, + error_mechanisms_to_exclude: list[ErrorMechanism] | None = None, + seed: int | None = None, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Creates errors in a given DataFrame, at a rate of *approximately* max_error_rate. 
@@ -266,4 +309,4 @@ def create_errors( # noqa: PLR0913 # Create Errors & Return dirty_data, error_mask = mid_level.create_errors(data_copy, config) - return dirty_data, error_mask + return dirty_data, error_mask, config From 1191bbdf29aa47a93d1444f7b46dc25f6cd00dd9 Mon Sep 17 00:00:00 2001 From: chandlerNick Date: Fri, 13 Mar 2026 14:34:14 +0100 Subject: [PATCH 2/5] Add __repr__ and __str__ methods to _error_model.py --- examples/1-Getting-Started.ipynb | 18 +- examples/Error_Types.ipynb | 326 +++++++++++++++---------------- tab_err/_error_model.py | 26 +++ 3 files changed, 205 insertions(+), 165 deletions(-) diff --git a/examples/1-Getting-Started.ipynb b/examples/1-Getting-Started.ipynb index ef44aca..75c669b 100644 --- a/examples/1-Getting-Started.ipynb +++ b/examples/1-Getting-Started.ipynb @@ -298,8 +298,6 @@ "```\n", "\n", "\n", - "\n", - "\n", "#### Example of Seed Specification\n", "\n", "When using `seed` and specifying the inclusion of specific `ErrorType` objects, the seed must be set in the constructor of these objects manually if fixing of the random generator is desired.\n", @@ -323,6 +321,22 @@ " error_mechanisms_to_exclude=[error_mechanism.EAR()],\n", " seed=42\n", " )\n", + "```\n", + "\n", + "#### Config Extraction\n", + "\n", + "The user can print each error model used in the high-level API as follows. Note that in the example, the two indices on the config object are the affected column name and the error model number. i.e. 
`config.columns[col-name][error-model-number]`.\n", + "\n", + "```python\n", + "from tab_err.api import high_level\n", + "\n", + "corrupted_data, error_mask, config = high_level.create_errors_with_config(\n", + " data=data,\n", + " error_rate=0.5\n", + ")\n", + "\n", + "# If \"name\" were a column in data,\n", + "print(config.columns['name'][0])\n", "```" ] } diff --git a/examples/Error_Types.ipynb b/examples/Error_Types.ipynb index ebb87c9..be1811f 100644 --- a/examples/Error_Types.ipynb +++ b/examples/Error_Types.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "e031b356-c92e-4e6c-9422-ea968c81aa64", "metadata": {}, "outputs": [], @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 155, "id": "eaf1bf17-08f2-4627-a6bb-318b60e3528d", "metadata": {}, "outputs": [], @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 156, "id": "18931616-22c3-4f63-8e7d-c0710d924716", "metadata": {}, "outputs": [], @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 157, "id": "351080b6-841f-4e5e-809f-0a237eaa4fe5", "metadata": {}, "outputs": [ @@ -249,7 +249,7 @@ "5 1.0 False False False " ] }, - "execution_count": 4, + "execution_count": 157, "metadata": {}, "output_type": "execute_result" } @@ -272,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 158, "id": "ec59d663-3e0a-45a8-a662-9ca8674f192c", "metadata": {}, "outputs": [ @@ -358,10 +358,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 4.1\n", + " 4.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 4\n", @@ -370,10 +370,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " 2.0\n", - " False\n", + " 2.1\n", " False\n", " False\n", + " True\n", " \n", " \n", " 5\n", @@ -406,12 +406,12 @@ "0 1.0 False False False \n", "1 3.0 False False False \n", "2 3.0 False False False \n", - "3 4.1 False False True \n", - "4 
2.0 False False False \n", + "3 4.0 False False False \n", + "4 2.1 False False True \n", "5 1.0 False False False " ] }, - "execution_count": 5, + "execution_count": 158, "metadata": {}, "output_type": "execute_result" } @@ -438,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 159, "id": "bcfc46a3-16dc-4427-b580-6d1dd09b9d87", "metadata": {}, "outputs": [ @@ -577,7 +577,7 @@ "5 1.0 False False False " ] }, - "execution_count": 6, + "execution_count": 159, "metadata": {}, "output_type": "execute_result" } @@ -600,7 +600,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 160, "id": "a07d9c15-c43c-48e0-beb4-b074855e557c", "metadata": {}, "outputs": [ @@ -650,10 +650,10 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " 1.0\n", - " False\n", + " 9999.0\n", " False\n", " False\n", + " True\n", " \n", " \n", " 1\n", @@ -662,10 +662,10 @@ " 3.0\n", " Alice\n", " 1984\n", - " 9999.0\n", + " 3.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 2\n", @@ -731,15 +731,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 1.0 False False False \n", - "1 9999.0 False False True \n", + "0 9999.0 False False True \n", + "1 3.0 False False False \n", "2 3.0 False False False \n", "3 4.0 False False False \n", "4 2.0 False False False \n", "5 1.0 False False False " ] }, - "execution_count": 7, + "execution_count": 160, "metadata": {}, "output_type": "execute_result" } @@ -770,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 161, "id": "4834399f-c5b6-4d3d-8bd8-2869ec4637a5", "metadata": {}, "outputs": [ @@ -825,11 +825,11 @@ " 1\n", " Alice\n", " ¿Cómo estás?\n", - " 11/10 12 a.m.\n", + " 12 a.m.\n", + " False\n", " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 1\n", @@ -870,11 +870,11 @@ " 2\n", " Bob\n", " Ça va bien, merci.\n", - " 4 a.m.\n", - " False\n", + " 11/10 4 a.m.\n", " False\n", " False\n", " False\n", + " 
True\n", " \n", " \n", " 4\n", @@ -920,17 +920,17 @@ "4 3 Clara ¡Nos vemos mañana! 1 p.m. 3 Clara \n", "5 4 David Ich hätte Hunger. 1 p.m. 4 David \n", "\n", - " error_mask \n", - " content timestamp user_id user content timestamp \n", - "0 ¿Cómo estás? 11/10 12 a.m. False False False True \n", - "1 Привет, как дела? 11/10 3 p.m. False False False True \n", - "2 今日はどうですか 3 p.m. False False False False \n", - "3 Ça va bien, merci. 4 a.m. False False False False \n", - "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", - "5 Ich hätte Hunger. 11/10 1 p.m. False False False True " + " error_mask \n", + " content timestamp user_id user content timestamp \n", + "0 ¿Cómo estás? 12 a.m. False False False False \n", + "1 Привет, как дела? 11/10 3 p.m. False False False True \n", + "2 今日はどうですか 3 p.m. False False False False \n", + "3 Ça va bien, merci. 11/10 4 a.m. False False False True \n", + "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", + "5 Ich hätte Hunger. 11/10 1 p.m. False False False True " ] }, - "execution_count": 8, + "execution_count": 161, "metadata": {}, "output_type": "execute_result" } @@ -956,7 +956,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 162, "id": "948c980c-9b16-4bde-b63b-9e450a7e1ac5", "metadata": {}, "outputs": [ @@ -1023,11 +1023,11 @@ " Alice\n", " Привет, как дела?\n", " 3 p.m.\n", - " 1\n", + " 1.0\n", " Alice\n", " Привет, как дела?\n", " 3 p.m.\n", - " False\n", + " True\n", " False\n", " False\n", " False\n", @@ -1068,11 +1068,11 @@ " Clara\n", " ¡Nos vemos mañana!\n", " 1 p.m.\n", - " 3.0\n", + " 3\n", " Clara\n", " ¡Nos vemos mañana!\n", " 1 p.m.\n", - " True\n", + " False\n", " False\n", " False\n", " False\n", @@ -1100,23 +1100,23 @@ " original perturbed \\\n", " user_id user content timestamp user_id user \n", "0 1 Alice ¿Cómo estás? 12 a.m. 1 Alice \n", - "1 1 Alice Привет, как дела? 3 p.m. 1 Alice \n", + "1 1 Alice Привет, как дела? 3 p.m. 1.0 Alice \n", "2 2 Bob 今日はどうですか 3 p.m. 
2.0 Bob \n", "3 2 Bob Ça va bien, merci. 4 a.m. 2 Bob \n", - "4 3 Clara ¡Nos vemos mañana! 1 p.m. 3.0 Clara \n", + "4 3 Clara ¡Nos vemos mañana! 1 p.m. 3 Clara \n", "5 4 David Ich hätte Hunger. 1 p.m. 4.0 David \n", "\n", " error_mask \n", " content timestamp user_id user content timestamp \n", "0 ¿Cómo estás? 12 a.m. False False False False \n", - "1 Привет, как дела? 3 p.m. False False False False \n", + "1 Привет, как дела? 3 p.m. True False False False \n", "2 今日はどうですか 3 p.m. True False False False \n", "3 Ça va bien, merci. 4 a.m. False False False False \n", - "4 ¡Nos vemos mañana! 1 p.m. True False False False \n", + "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", "5 Ich hätte Hunger. 1 p.m. True False False False " ] }, - "execution_count": 9, + "execution_count": 162, "metadata": {}, "output_type": "execute_result" } @@ -1141,7 +1141,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 163, "id": "b9606b2a-b26f-493d-bba1-f61803aa2a4f", "metadata": {}, "outputs": [ @@ -1195,11 +1195,11 @@ " 12 a.m.\n", " 1\n", " Alice\n", - " ¿Cómo estás?\n", + " Cmo est疽?\n", " 12 a.m.\n", " False\n", " False\n", - " False\n", + " True\n", " False\n", " \n", " \n", @@ -1210,11 +1210,11 @@ " 3 p.m.\n", " 1\n", " Alice\n", - " Ďđčâĺň, ęŕę äĺëŕ?\n", + " Привет, как дела?\n", " 3 p.m.\n", " False\n", " False\n", - " True\n", + " False\n", " False\n", " \n", " \n", @@ -1225,11 +1225,11 @@ " 3 p.m.\n", " 2\n", " Bob\n", - " 今日はどうですか\n", + " \n", " 3 p.m.\n", " False\n", " False\n", - " False\n", + " True\n", " False\n", " \n", " \n", @@ -1240,7 +1240,7 @@ " 4 a.m.\n", " 2\n", " Bob\n", - " a va bien, merci.\n", + " ヌa va bien, merci.\n", " 4 a.m.\n", " False\n", " False\n", @@ -1270,11 +1270,11 @@ " 1 p.m.\n", " 4\n", " David\n", - " Ich htte Hunger.\n", + " Ich hätte Hunger.\n", " 1 p.m.\n", " False\n", " False\n", - " True\n", + " False\n", " False\n", " \n", " \n", @@ -1293,15 +1293,15 @@ "\n", " error_mask \n", " content timestamp user_id 
user content timestamp \n", - "0 ¿Cómo estás? 12 a.m. False False False False \n", - "1 Ďđčâĺň, ęŕę äĺëŕ? 3 p.m. False False True False \n", - "2 今日はどうですか 3 p.m. False False False False \n", - "3 a va bien, merci. 4 a.m. False False True False \n", + "0 Cmo est疽? 12 a.m. False False True False \n", + "1 Привет, как дела? 3 p.m. False False False False \n", + "2 3 p.m. False False True False \n", + "3 ヌa va bien, merci. 4 a.m. False False True False \n", "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", - "5 Ich htte Hunger. 1 p.m. False False True False " + "5 Ich hätte Hunger. 1 p.m. False False False False " ] }, - "execution_count": 10, + "execution_count": 163, "metadata": {}, "output_type": "execute_result" } @@ -1326,7 +1326,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 164, "id": "2c3b59e7-f651-424b-94b0-dcd485e0c15d", "metadata": {}, "outputs": [ @@ -1376,7 +1376,7 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " -2.778344\n", + " -2.944719\n", " False\n", " False\n", " True\n", @@ -1388,7 +1388,7 @@ " 3.0\n", " Alice\n", " 1984\n", - " 6.235905\n", + " 6.516490\n", " False\n", " False\n", " True\n", @@ -1424,10 +1424,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " -1.759203\n", + " 2.000000\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 5\n", @@ -1436,10 +1436,10 @@ " 1.0\n", " Bob\n", " The Catcher in the Rye\n", - " 1.000000\n", - " False\n", + " -2.717591\n", " False\n", " False\n", + " True\n", " \n", " \n", "\n", @@ -1457,15 +1457,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 -2.778344 False False True \n", - "1 6.235905 False False True \n", + "0 -2.944719 False False True \n", + "1 6.516490 False False True \n", "2 3.000000 False False False \n", "3 4.000000 False False False \n", - "4 -1.759203 False False True \n", - "5 1.000000 False False False " + "4 2.000000 False False False \n", + "5 -2.717591 False False True " ] }, - "execution_count": 11, + 
"execution_count": 164, "metadata": {}, "output_type": "execute_result" } @@ -1490,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 165, "id": "affc303b-d72f-41fc-bc4c-ddac9303a289", "metadata": {}, "outputs": [ @@ -1530,8 +1530,8 @@ " \n", " 0\n", " service-A-2024-02-01\n", - " service-A-2024-02-01\n", - " False\n", + " 01-A-2024-service-02\n", + " True\n", " \n", " \n", " 1\n", @@ -1548,13 +1548,13 @@ " \n", " 3\n", " service-A-2024-02-01\n", - " 01-A-2024-service-02\n", + " service-A-2024-01-02\n", " True\n", " \n", " \n", " 4\n", " service-B-2024-02-02\n", - " B-service-2024-02-02\n", + " service-02-B-2024-02\n", " True\n", " \n", " \n", @@ -1566,19 +1566,19 @@ " \n", " 6\n", " service-C-2024-02-01\n", - " 01-02-service-C-2024\n", + " 02-01-2024-service-C\n", " True\n", " \n", " \n", " 7\n", " service-C-2024-02-02\n", - " C-service-02-2024-02\n", - " True\n", + " service-C-2024-02-02\n", + " False\n", " \n", " \n", " 8\n", " service-C-2024-02-03\n", - " 03-C-service-02-2024\n", + " service-02-03-C-2024\n", " True\n", " \n", " \n", @@ -1588,18 +1588,18 @@ "text/plain": [ " original perturbed error_mask\n", " service service service\n", - "0 service-A-2024-02-01 service-A-2024-02-01 False\n", + "0 service-A-2024-02-01 01-A-2024-service-02 True\n", "1 service-A-2024-02-02 service-A-2024-02-02 False\n", "2 service-A-2024-02-03 service-A-2024-02-03 False\n", - "3 service-A-2024-02-01 01-A-2024-service-02 True\n", - "4 service-B-2024-02-02 B-service-2024-02-02 True\n", + "3 service-A-2024-02-01 service-A-2024-01-02 True\n", + "4 service-B-2024-02-02 service-02-B-2024-02 True\n", "5 service-B-2024-02-03 service-B-2024-02-03 False\n", - "6 service-C-2024-02-01 01-02-service-C-2024 True\n", - "7 service-C-2024-02-02 C-service-02-2024-02 True\n", - "8 service-C-2024-02-03 03-C-service-02-2024 True" + "6 service-C-2024-02-01 02-01-2024-service-C True\n", + "7 service-C-2024-02-02 service-C-2024-02-02 False\n", + "8 service-C-2024-02-03 
service-02-03-C-2024 True" ] }, - "execution_count": 12, + "execution_count": 165, "metadata": {}, "output_type": "execute_result" } @@ -1622,7 +1622,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 166, "id": "eae650d0-d6a9-4599-8c0f-8f4216ea7b63", "metadata": {}, "outputs": [ @@ -1662,7 +1662,7 @@ " \n", " 0\n", " service-A-2024-02-01\n", - " service-01-A-02-2024\n", + " 01-02-2024-A-service\n", " True\n", " \n", " \n", @@ -1674,13 +1674,13 @@ " \n", " 2\n", " service-A-2024-02-03\n", - " service-03-A-02-2024\n", - " True\n", + " service-A-2024-02-03\n", + " False\n", " \n", " \n", " 3\n", " service-A-2024-02-01\n", - " service-01-A-02-2024\n", + " 01-02-2024-A-service\n", " True\n", " \n", " \n", @@ -1698,20 +1698,20 @@ " \n", " 6\n", " service-C-2024-02-01\n", - " service-01-C-02-2024\n", + " 01-02-2024-C-service\n", " True\n", " \n", " \n", " 7\n", " service-C-2024-02-02\n", - " service-02-C-02-2024\n", + " 02-02-2024-C-service\n", " True\n", " \n", " \n", " 8\n", " service-C-2024-02-03\n", - " service-C-2024-02-03\n", - " False\n", + " 03-02-2024-C-service\n", + " True\n", " \n", " \n", "\n", @@ -1720,18 +1720,18 @@ "text/plain": [ " original perturbed error_mask\n", " service service service\n", - "0 service-A-2024-02-01 service-01-A-02-2024 True\n", + "0 service-A-2024-02-01 01-02-2024-A-service True\n", "1 service-A-2024-02-02 service-A-2024-02-02 False\n", - "2 service-A-2024-02-03 service-03-A-02-2024 True\n", - "3 service-A-2024-02-01 service-01-A-02-2024 True\n", + "2 service-A-2024-02-03 service-A-2024-02-03 False\n", + "3 service-A-2024-02-01 01-02-2024-A-service True\n", "4 service-B-2024-02-02 service-B-2024-02-02 False\n", "5 service-B-2024-02-03 service-B-2024-02-03 False\n", - "6 service-C-2024-02-01 service-01-C-02-2024 True\n", - "7 service-C-2024-02-02 service-02-C-02-2024 True\n", - "8 service-C-2024-02-03 service-C-2024-02-03 False" + "6 service-C-2024-02-01 01-02-2024-C-service True\n", + "7 service-C-2024-02-02 
02-02-2024-C-service True\n", + "8 service-C-2024-02-03 03-02-2024-C-service True" ] }, - "execution_count": 13, + "execution_count": 166, "metadata": {}, "output_type": "execute_result" } @@ -1756,7 +1756,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 167, "id": "0d36eba7-9e7b-42c3-8f3e-8d42588c53e7", "metadata": {}, "outputs": [ @@ -1796,8 +1796,8 @@ " \n", " 0\n", " service-A-2024-02-01\n", - " service_A_2024_02_01\n", - " True\n", + " service-A-2024-02-01\n", + " False\n", " \n", " \n", " 1\n", @@ -1808,14 +1808,14 @@ " \n", " 2\n", " service-A-2024-02-03\n", - " service_A_2024_02_03\n", - " True\n", + " service-A-2024-02-03\n", + " False\n", " \n", " \n", " 3\n", " service-A-2024-02-01\n", - " service-A-2024-02-01\n", - " False\n", + " service_A_2024_02_01\n", + " True\n", " \n", " \n", " 4\n", @@ -1832,20 +1832,20 @@ " \n", " 6\n", " service-C-2024-02-01\n", - " service_C_2024_02_01\n", - " True\n", + " service-C-2024-02-01\n", + " False\n", " \n", " \n", " 7\n", " service-C-2024-02-02\n", - " service-C-2024-02-02\n", - " False\n", + " service_C_2024_02_02\n", + " True\n", " \n", " \n", " 8\n", " service-C-2024-02-03\n", - " service-C-2024-02-03\n", - " False\n", + " service_C_2024_02_03\n", + " True\n", " \n", " \n", "\n", @@ -1854,18 +1854,18 @@ "text/plain": [ " original perturbed error_mask\n", " service service service\n", - "0 service-A-2024-02-01 service_A_2024_02_01 True\n", + "0 service-A-2024-02-01 service-A-2024-02-01 False\n", "1 service-A-2024-02-02 service-A-2024-02-02 False\n", - "2 service-A-2024-02-03 service_A_2024_02_03 True\n", - "3 service-A-2024-02-01 service-A-2024-02-01 False\n", + "2 service-A-2024-02-03 service-A-2024-02-03 False\n", + "3 service-A-2024-02-01 service_A_2024_02_01 True\n", "4 service-B-2024-02-02 service_B_2024_02_02 True\n", "5 service-B-2024-02-03 service_B_2024_02_03 True\n", - "6 service-C-2024-02-01 service_C_2024_02_01 True\n", - "7 service-C-2024-02-02 service-C-2024-02-02 
False\n", - "8 service-C-2024-02-03 service-C-2024-02-03 False" + "6 service-C-2024-02-01 service-C-2024-02-01 False\n", + "7 service-C-2024-02-02 service_C_2024_02_02 True\n", + "8 service-C-2024-02-03 service_C_2024_02_03 True" ] }, - "execution_count": 14, + "execution_count": 167, "metadata": {}, "output_type": "execute_result" } @@ -1890,7 +1890,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 168, "id": "246551cb-946b-4732-92e9-6806b46f1d26", "metadata": {}, "outputs": [ @@ -1939,10 +1939,10 @@ " To Kill a Mockingbird\n", " 1.0\n", " Alice\n", - " To Kill a Mockingburd\n", + " To Kill a Mockingbird\n", " 1.0\n", " False\n", - " True\n", + " False\n", " False\n", " \n", " \n", @@ -1951,10 +1951,10 @@ " 1984\n", " 3.0\n", " Alice\n", - " 1984\n", + " 2984\n", " 3.0\n", " False\n", - " False\n", + " True\n", " False\n", " \n", " \n", @@ -1963,10 +1963,10 @@ " Pride and Prejudice\n", " 3.0\n", " Alice\n", - " Pride abd Prejudice\n", + " Pride and Prejudice\n", " 3.0\n", " False\n", - " True\n", + " False\n", " False\n", " \n", " \n", @@ -1987,10 +1987,10 @@ " Moby-Dick\n", " 2.0\n", " Bob\n", - " Moby-Dick\n", + " Moby-Didk\n", " 2.0\n", " False\n", - " False\n", + " True\n", " False\n", " \n", " \n", @@ -1999,7 +1999,7 @@ " The Catcher in the Rye\n", " 1.0\n", " Bob\n", - " The Catcher 9n the Rye\n", + " The Catcher in thr Rye\n", " 1.0\n", " False\n", " True\n", @@ -2012,24 +2012,24 @@ "text/plain": [ " original perturbed \\\n", " typist book_title rating typist book_title \n", - "0 Alice To Kill a Mockingbird 1.0 Alice To Kill a Mockingburd \n", - "1 Alice 1984 3.0 Alice 1984 \n", - "2 Alice Pride and Prejudice 3.0 Alice Pride abd Prejudice \n", + "0 Alice To Kill a Mockingbird 1.0 Alice To Kill a Mockingbird \n", + "1 Alice 1984 3.0 Alice 2984 \n", + "2 Alice Pride and Prejudice 3.0 Alice Pride and Prejudice \n", "3 Bob The Great Gatsby 4.0 Bob The Great Gatsby \n", - "4 Bob Moby-Dick 2.0 Bob Moby-Dick \n", - "5 Bob The Catcher in 
the Rye 1.0 Bob The Catcher 9n the Rye \n", + "4 Bob Moby-Dick 2.0 Bob Moby-Didk \n", + "5 Bob The Catcher in the Rye 1.0 Bob The Catcher in thr Rye \n", "\n", " error_mask \n", " rating typist book_title rating \n", - "0 1.0 False True False \n", - "1 3.0 False False False \n", - "2 3.0 False True False \n", + "0 1.0 False False False \n", + "1 3.0 False True False \n", + "2 3.0 False False False \n", "3 4.0 False False False \n", - "4 2.0 False False False \n", + "4 2.0 False True False \n", "5 1.0 False True False " ] }, - "execution_count": 15, + "execution_count": 168, "metadata": {}, "output_type": "execute_result" } @@ -2054,7 +2054,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 169, "id": "fbb7c9fd-6d14-4bff-a6c9-780117e6218b", "metadata": {}, "outputs": [ @@ -2128,10 +2128,10 @@ " 3.0\n", " Alice\n", " Pride and Prejudice\n", - " 30.0\n", + " 3.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 3\n", @@ -2140,10 +2140,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 4.0\n", - " False\n", + " 40.0\n", " False\n", " False\n", + " True\n", " \n", " \n", " 4\n", @@ -2152,10 +2152,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " 20.0\n", + " 2.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 5\n", @@ -2164,10 +2164,10 @@ " 1.0\n", " Bob\n", " The Catcher in the Rye\n", - " 1.0\n", - " False\n", + " 10.0\n", " False\n", " False\n", + " True\n", " \n", " \n", "\n", @@ -2187,13 +2187,13 @@ " rating typist book_title rating \n", "0 10.0 False False True \n", "1 3.0 False False False \n", - "2 30.0 False False True \n", - "3 4.0 False False False \n", - "4 20.0 False False True \n", - "5 1.0 False False False " + "2 3.0 False False False \n", + "3 40.0 False False True \n", + "4 2.0 False False False \n", + "5 10.0 False False True " ] }, - "execution_count": 16, + "execution_count": 169, "metadata": {}, "output_type": "execute_result" } @@ -2209,7 +2209,7 @@ ], "metadata": { "kernelspec": 
{ - "display_name": "Python 3 (ipykernel)", + "display_name": "tab-err", "language": "python", "name": "python3" }, @@ -2223,7 +2223,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.19" + "version": "3.12.11" } }, "nbformat": 4, diff --git a/tab_err/_error_model.py b/tab_err/_error_model.py index 4908e1c..63bcd7e 100644 --- a/tab_err/_error_model.py +++ b/tab_err/_error_model.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING from tab_err.api import low_level +from tab_err.error_mechanism import EAR if TYPE_CHECKING: import pandas as pd @@ -25,6 +26,31 @@ class ErrorModel: error_type: ErrorType error_rate: float + def __repr__(self) -> str: + """Unambiguous representation, evaluating the mechanism's repr but only the type's name.""" + if self.error_mechanism.__class__ == EAR: + return ( + f"{self.__class__.__name__}(" + f"error_mechanism={self.error_mechanism.__class__.__name__}(condition_to_column='{self.error_mechanism.condition_to_column}'), " + f"error_type={self.error_type.__class__.__name__}, " + f"error_rate={self.error_rate})" + ) + else: + return ( + f"{self.__class__.__name__}(" + f"error_mechanism={self.error_mechanism.__class__.__name__}, " + f"error_type={self.error_type.__class__.__name__}, " + f"error_rate={self.error_rate})" + ) + + def __str__(self) -> str: + """Readable representation for end-users.""" + # Assumes error_rate is a float like 0.05. Displays as 5.0%. 
+ if self.error_mechanism.__class__ == EAR: + return f"ErrorModel: {self.error_rate:.1%} '{self.error_type.__class__.__name__}' errors via {self.error_mechanism.__class__.__name__} conditioning on column '{self.error_mechanism.condition_to_column}'" + else: + return f"ErrorModel: {self.error_rate:.1%} '{self.error_type.__class__.__name__}' errors via {self.error_mechanism.__class__.__name__}" + def apply(self: ErrorModel, data: pd.DataFrame, column: str | int) -> tuple[pd.DataFrame, pd.DataFrame]: """Applies the defined ErrorModel to the given column of a pandas DataFrame. From 380fec1c24fad24d8096a83678dad51e700dfd25 Mon Sep 17 00:00:00 2001 From: chandlerNick Date: Fri, 13 Mar 2026 14:45:48 +0100 Subject: [PATCH 3/5] Ruff and mypy --- tab_err/_error_model.py | 18 ++++++++---------- tab_err/api/high_level.py | 11 +++++++++-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/tab_err/_error_model.py b/tab_err/_error_model.py index 63bcd7e..6f14e4c 100644 --- a/tab_err/_error_model.py +++ b/tab_err/_error_model.py @@ -35,21 +35,19 @@ def __repr__(self) -> str: f"error_type={self.error_type.__class__.__name__}, " f"error_rate={self.error_rate})" ) - else: - return ( - f"{self.__class__.__name__}(" - f"error_mechanism={self.error_mechanism.__class__.__name__}, " - f"error_type={self.error_type.__class__.__name__}, " - f"error_rate={self.error_rate})" - ) + return ( + f"{self.__class__.__name__}(" + f"error_mechanism={self.error_mechanism.__class__.__name__}, " + f"error_type={self.error_type.__class__.__name__}, " + f"error_rate={self.error_rate})" + ) def __str__(self) -> str: """Readable representation for end-users.""" # Assumes error_rate is a float like 0.05. Displays as 5.0%. 
if self.error_mechanism.__class__ == EAR: - return f"ErrorModel: {self.error_rate:.1%} '{self.error_type.__class__.__name__}' errors via {self.error_mechanism.__class__.__name__} conditioning on column '{self.error_mechanism.condition_to_column}'" - else: - return f"ErrorModel: {self.error_rate:.1%} '{self.error_type.__class__.__name__}' errors via {self.error_mechanism.__class__.__name__}" + return f"ErrorModel: {self.error_rate:.1%} '{self.error_type.__class__.__name__}' errors via {self.error_mechanism.__class__.__name__} conditioning on column '{self.error_mechanism.condition_to_column}'" # Noqa: E501 + return f"ErrorModel: {self.error_rate:.1%} '{self.error_type.__class__.__name__}' errors via {self.error_mechanism.__class__.__name__}" def apply(self: ErrorModel, data: pd.DataFrame, column: str | int) -> tuple[pd.DataFrame, pd.DataFrame]: """Applies the defined ErrorModel to the given column of a pandas DataFrame. diff --git a/tab_err/api/high_level.py b/tab_err/api/high_level.py index 6724f27..2f68442 100644 --- a/tab_err/api/high_level.py +++ b/tab_err/api/high_level.py @@ -198,7 +198,10 @@ def create_errors( # noqa: PLR0913 error_mechanisms_to_exclude: list[ErrorMechanism] | None = None, seed: int | None = None, ) -> tuple[pd.DataFrame, pd.DataFrame]: - """Creates errors in a given DataFrame, at a rate of *approximately* max_error_rate. Allows for terse usage of create errors with config when information about errors is unnecessary. + """Creates errors in a given DataFrame, at a rate of *approximately* max_error_rate. + + Description: + Functionally identical to `create_errors_with_config`, but allows for terser usage when detailed configuration is not needed. Args: data (pd.DataFrame): The pandas DataFrame to create errors in. 
@@ -240,9 +243,13 @@ def create_errors_with_config( # noqa: PLR0913 error_mechanisms_to_include: list[ErrorMechanism] | None = None, error_mechanisms_to_exclude: list[ErrorMechanism] | None = None, seed: int | None = None, -) -> tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[pd.DataFrame, pd.DataFrame, MidLevelConfig]: """Creates errors in a given DataFrame, at a rate of *approximately* max_error_rate. + Description: + Builds a configuration of error models to apply to the DataFrame based on the input parameters and then applies the error models to the DataFrame. + Returns the dirty DataFrame, the error mask, and the configuration for reproducibility. + Args: data (pd.DataFrame): The pandas DataFrame to create errors in. error_rate (float): The maximum error rate to be introduced to each column in the DataFrame. From c8b9d1505f4bde6fe41c4aaec5fc15167b38aaf6 Mon Sep 17 00:00:00 2001 From: chandlerNick Date: Fri, 13 Mar 2026 15:29:37 +0100 Subject: [PATCH 4/5] Revert Error_Types.ipynb to previous main --- examples/Error_Types.ipynb | 354 ++++++++++++++++++------------------- 1 file changed, 177 insertions(+), 177 deletions(-) diff --git a/examples/Error_Types.ipynb b/examples/Error_Types.ipynb index be1811f..0a7f83b 100644 --- a/examples/Error_Types.ipynb +++ b/examples/Error_Types.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "e031b356-c92e-4e6c-9422-ea968c81aa64", "metadata": {}, "outputs": [], @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 3, "id": "eaf1bf17-08f2-4627-a6bb-318b60e3528d", "metadata": {}, "outputs": [], @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 4, "id": "18931616-22c3-4f63-8e7d-c0710d924716", "metadata": {}, "outputs": [], @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 5, "id": "351080b6-841f-4e5e-809f-0a237eaa4fe5", "metadata": {}, "outputs": [ @@ 
-184,10 +184,10 @@ " 3.0\n", " Alice\n", " Pride and Prejudice\n", - " 3.1\n", + " 3.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 3\n", @@ -196,10 +196,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 4.0\n", - " False\n", + " 4.1\n", " False\n", " False\n", + " True\n", " \n", " \n", " 4\n", @@ -243,13 +243,13 @@ " rating typist book_title rating \n", "0 1.0 False False False \n", "1 3.0 False False False \n", - "2 3.1 False False True \n", - "3 4.0 False False False \n", + "2 3.0 False False False \n", + "3 4.1 False False True \n", "4 2.0 False False False \n", "5 1.0 False False False " ] }, - "execution_count": 157, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -272,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 6, "id": "ec59d663-3e0a-45a8-a662-9ca8674f192c", "metadata": {}, "outputs": [ @@ -411,7 +411,7 @@ "5 1.0 False False False " ] }, - "execution_count": 158, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -438,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 7, "id": "bcfc46a3-16dc-4427-b580-6d1dd09b9d87", "metadata": {}, "outputs": [ @@ -488,10 +488,10 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " 1.0\n", - " False\n", + " NaN\n", " False\n", " False\n", + " True\n", " \n", " \n", " 1\n", @@ -524,10 +524,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " NaN\n", + " 4.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 4\n", @@ -569,15 +569,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 1.0 False False False \n", + "0 NaN False False True \n", "1 3.0 False False False \n", "2 3.0 False False False \n", - "3 NaN False False True \n", + "3 4.0 False False False \n", "4 2.0 False False False \n", "5 1.0 False False False " ] }, - "execution_count": 159, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ 
-600,7 +600,7 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 8, "id": "a07d9c15-c43c-48e0-beb4-b074855e557c", "metadata": {}, "outputs": [ @@ -650,10 +650,10 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " 9999.0\n", + " 1.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 1\n", @@ -698,10 +698,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " 2.0\n", - " False\n", + " 9999.0\n", " False\n", " False\n", + " True\n", " \n", " \n", " 5\n", @@ -731,15 +731,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 9999.0 False False True \n", + "0 1.0 False False False \n", "1 3.0 False False False \n", "2 3.0 False False False \n", "3 4.0 False False False \n", - "4 2.0 False False False \n", + "4 9999.0 False False True \n", "5 1.0 False False False " ] }, - "execution_count": 160, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -770,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 9, "id": "4834399f-c5b6-4d3d-8bd8-2869ec4637a5", "metadata": {}, "outputs": [ @@ -825,11 +825,11 @@ " 1\n", " Alice\n", " ¿Cómo estás?\n", - " 12 a.m.\n", - " False\n", + " 11/10 12 a.m.\n", " False\n", " False\n", " False\n", + " True\n", " \n", " \n", " 1\n", @@ -840,11 +840,11 @@ " 1\n", " Alice\n", " Привет, как дела?\n", - " 11/10 3 p.m.\n", + " 3 p.m.\n", + " False\n", " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 2\n", @@ -885,11 +885,11 @@ " 3\n", " Clara\n", " ¡Nos vemos mañana!\n", - " 1 p.m.\n", - " False\n", + " 11/10 1 p.m.\n", " False\n", " False\n", " False\n", + " True\n", " \n", " \n", " 5\n", @@ -900,11 +900,11 @@ " 4\n", " David\n", " Ich hätte Hunger.\n", - " 11/10 1 p.m.\n", + " 1 p.m.\n", + " False\n", " False\n", " False\n", " False\n", - " True\n", " \n", " \n", "\n", @@ -920,17 +920,17 @@ "4 3 Clara ¡Nos vemos mañana! 1 p.m. 3 Clara \n", "5 4 David Ich hätte Hunger. 1 p.m. 
4 David \n", "\n", - " error_mask \n", - " content timestamp user_id user content timestamp \n", - "0 ¿Cómo estás? 12 a.m. False False False False \n", - "1 Привет, как дела? 11/10 3 p.m. False False False True \n", - "2 今日はどうですか 3 p.m. False False False False \n", - "3 Ça va bien, merci. 11/10 4 a.m. False False False True \n", - "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", - "5 Ich hätte Hunger. 11/10 1 p.m. False False False True " + " error_mask \n", + " content timestamp user_id user content timestamp \n", + "0 ¿Cómo estás? 11/10 12 a.m. False False False True \n", + "1 Привет, как дела? 3 p.m. False False False False \n", + "2 今日はどうですか 3 p.m. False False False False \n", + "3 Ça va bien, merci. 11/10 4 a.m. False False False True \n", + "4 ¡Nos vemos mañana! 11/10 1 p.m. False False False True \n", + "5 Ich hätte Hunger. 1 p.m. False False False False " ] }, - "execution_count": 161, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -956,7 +956,7 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 10, "id": "948c980c-9b16-4bde-b63b-9e450a7e1ac5", "metadata": {}, "outputs": [ @@ -1008,11 +1008,11 @@ " Alice\n", " ¿Cómo estás?\n", " 12 a.m.\n", - " 1\n", + " 1.0\n", " Alice\n", " ¿Cómo estás?\n", " 12 a.m.\n", - " False\n", + " True\n", " False\n", " False\n", " False\n", @@ -1023,11 +1023,11 @@ " Alice\n", " Привет, как дела?\n", " 3 p.m.\n", - " 1.0\n", + " 1\n", " Alice\n", " Привет, как дела?\n", " 3 p.m.\n", - " True\n", + " False\n", " False\n", " False\n", " False\n", @@ -1053,11 +1053,11 @@ " Bob\n", " Ça va bien, merci.\n", " 4 a.m.\n", - " 2\n", + " 2.0\n", " Bob\n", " Ça va bien, merci.\n", " 4 a.m.\n", - " False\n", + " True\n", " False\n", " False\n", " False\n", @@ -1083,11 +1083,11 @@ " David\n", " Ich hätte Hunger.\n", " 1 p.m.\n", - " 4.0\n", + " 4\n", " David\n", " Ich hätte Hunger.\n", " 1 p.m.\n", - " True\n", + " False\n", " False\n", " False\n", " False\n", @@ -1099,24 
+1099,24 @@ "text/plain": [ " original perturbed \\\n", " user_id user content timestamp user_id user \n", - "0 1 Alice ¿Cómo estás? 12 a.m. 1 Alice \n", - "1 1 Alice Привет, как дела? 3 p.m. 1.0 Alice \n", + "0 1 Alice ¿Cómo estás? 12 a.m. 1.0 Alice \n", + "1 1 Alice Привет, как дела? 3 p.m. 1 Alice \n", "2 2 Bob 今日はどうですか 3 p.m. 2.0 Bob \n", - "3 2 Bob Ça va bien, merci. 4 a.m. 2 Bob \n", + "3 2 Bob Ça va bien, merci. 4 a.m. 2.0 Bob \n", "4 3 Clara ¡Nos vemos mañana! 1 p.m. 3 Clara \n", - "5 4 David Ich hätte Hunger. 1 p.m. 4.0 David \n", + "5 4 David Ich hätte Hunger. 1 p.m. 4 David \n", "\n", " error_mask \n", " content timestamp user_id user content timestamp \n", - "0 ¿Cómo estás? 12 a.m. False False False False \n", - "1 Привет, как дела? 3 p.m. True False False False \n", + "0 ¿Cómo estás? 12 a.m. True False False False \n", + "1 Привет, как дела? 3 p.m. False False False False \n", "2 今日はどうですか 3 p.m. True False False False \n", - "3 Ça va bien, merci. 4 a.m. False False False False \n", + "3 Ça va bien, merci. 4 a.m. True False False False \n", "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", - "5 Ich hätte Hunger. 1 p.m. True False False False " + "5 Ich hätte Hunger. 1 p.m. 
False False False False " ] }, - "execution_count": 162, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1141,7 +1141,7 @@ }, { "cell_type": "code", - "execution_count": 163, + "execution_count": 11, "id": "b9606b2a-b26f-493d-bba1-f61803aa2a4f", "metadata": {}, "outputs": [ @@ -1195,7 +1195,7 @@ " 12 a.m.\n", " 1\n", " Alice\n", - " Cmo est疽?\n", + " 驴C贸mo est谩s?\n", " 12 a.m.\n", " False\n", " False\n", @@ -1225,11 +1225,11 @@ " 3 p.m.\n", " 2\n", " Bob\n", - " \n", + " 今日はどうですか\n", " 3 p.m.\n", " False\n", " False\n", - " True\n", + " False\n", " False\n", " \n", " \n", @@ -1240,7 +1240,7 @@ " 4 a.m.\n", " 2\n", " Bob\n", - " ヌa va bien, merci.\n", + " a va bien, merci.\n", " 4 a.m.\n", " False\n", " False\n", @@ -1270,11 +1270,11 @@ " 1 p.m.\n", " 4\n", " David\n", - " Ich hätte Hunger.\n", + " Ich h盲tte Hunger.\n", " 1 p.m.\n", " False\n", " False\n", - " False\n", + " True\n", " False\n", " \n", " \n", @@ -1293,15 +1293,15 @@ "\n", " error_mask \n", " content timestamp user_id user content timestamp \n", - "0 Cmo est疽? 12 a.m. False False True False \n", + "0 驴C贸mo est谩s? 12 a.m. False False True False \n", "1 Привет, как дела? 3 p.m. False False False False \n", - "2 3 p.m. False False True False \n", - "3 ヌa va bien, merci. 4 a.m. False False True False \n", + "2 今日はどうですか 3 p.m. False False False False \n", + "3 a va bien, merci. 4 a.m. False False True False \n", "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", - "5 Ich hätte Hunger. 1 p.m. False False False False " + "5 Ich h盲tte Hunger. 1 p.m. 
False False True False " ] }, - "execution_count": 163, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1326,7 +1326,7 @@ }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 12, "id": "2c3b59e7-f651-424b-94b0-dcd485e0c15d", "metadata": {}, "outputs": [ @@ -1376,7 +1376,7 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " -2.944719\n", + " -2.576077\n", " False\n", " False\n", " True\n", @@ -1388,10 +1388,10 @@ " 3.0\n", " Alice\n", " 1984\n", - " 6.516490\n", + " 3.000000\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 2\n", @@ -1412,10 +1412,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 4.000000\n", - " False\n", + " 7.164860\n", " False\n", " False\n", + " True\n", " \n", " \n", " 4\n", @@ -1424,10 +1424,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " 2.000000\n", - " False\n", + " -1.803358\n", " False\n", " False\n", + " True\n", " \n", " \n", " 5\n", @@ -1436,10 +1436,10 @@ " 1.0\n", " Bob\n", " The Catcher in the Rye\n", - " -2.717591\n", + " 1.000000\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", "\n", @@ -1457,15 +1457,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 -2.944719 False False True \n", - "1 6.516490 False False True \n", + "0 -2.576077 False False True \n", + "1 3.000000 False False False \n", "2 3.000000 False False False \n", - "3 4.000000 False False False \n", - "4 2.000000 False False False \n", - "5 -2.717591 False False True " + "3 7.164860 False False True \n", + "4 -1.803358 False False True \n", + "5 1.000000 False False False " ] }, - "execution_count": 164, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1490,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": 13, "id": "affc303b-d72f-41fc-bc4c-ddac9303a289", "metadata": {}, "outputs": [ @@ -1530,8 +1530,8 @@ " \n", " 0\n", " service-A-2024-02-01\n", - " 01-A-2024-service-02\n", - " 
True\n", + " service-A-2024-02-01\n", + " False\n", " \n", " \n", " 1\n", @@ -1548,37 +1548,37 @@ " \n", " 3\n", " service-A-2024-02-01\n", - " service-A-2024-01-02\n", - " True\n", + " service-A-2024-02-01\n", + " False\n", " \n", " \n", " 4\n", " service-B-2024-02-02\n", - " service-02-B-2024-02\n", + " 2024-service-02-02-B\n", " True\n", " \n", " \n", " 5\n", " service-B-2024-02-03\n", - " service-B-2024-02-03\n", - " False\n", + " 02-2024-B-service-03\n", + " True\n", " \n", " \n", " 6\n", " service-C-2024-02-01\n", - " 02-01-2024-service-C\n", + " 02-2024-C-01-service\n", " True\n", " \n", " \n", " 7\n", " service-C-2024-02-02\n", - " service-C-2024-02-02\n", - " False\n", + " 2024-service-02-02-C\n", + " True\n", " \n", " \n", " 8\n", " service-C-2024-02-03\n", - " service-02-03-C-2024\n", + " 2024-C-03-service-02\n", " True\n", " \n", " \n", @@ -1588,18 +1588,18 @@ "text/plain": [ " original perturbed error_mask\n", " service service service\n", - "0 service-A-2024-02-01 01-A-2024-service-02 True\n", + "0 service-A-2024-02-01 service-A-2024-02-01 False\n", "1 service-A-2024-02-02 service-A-2024-02-02 False\n", "2 service-A-2024-02-03 service-A-2024-02-03 False\n", - "3 service-A-2024-02-01 service-A-2024-01-02 True\n", - "4 service-B-2024-02-02 service-02-B-2024-02 True\n", - "5 service-B-2024-02-03 service-B-2024-02-03 False\n", - "6 service-C-2024-02-01 02-01-2024-service-C True\n", - "7 service-C-2024-02-02 service-C-2024-02-02 False\n", - "8 service-C-2024-02-03 service-02-03-C-2024 True" + "3 service-A-2024-02-01 service-A-2024-02-01 False\n", + "4 service-B-2024-02-02 2024-service-02-02-B True\n", + "5 service-B-2024-02-03 02-2024-B-service-03 True\n", + "6 service-C-2024-02-01 02-2024-C-01-service True\n", + "7 service-C-2024-02-02 2024-service-02-02-C True\n", + "8 service-C-2024-02-03 2024-C-03-service-02 True" ] }, - "execution_count": 165, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1622,7 +1622,7 @@ }, { 
"cell_type": "code", - "execution_count": 166, + "execution_count": 14, "id": "eae650d0-d6a9-4599-8c0f-8f4216ea7b63", "metadata": {}, "outputs": [ @@ -1662,7 +1662,7 @@ " \n", " 0\n", " service-A-2024-02-01\n", - " 01-02-2024-A-service\n", + " A-01-2024-service-02\n", " True\n", " \n", " \n", @@ -1674,20 +1674,20 @@ " \n", " 2\n", " service-A-2024-02-03\n", - " service-A-2024-02-03\n", - " False\n", + " A-03-2024-service-02\n", + " True\n", " \n", " \n", " 3\n", " service-A-2024-02-01\n", - " 01-02-2024-A-service\n", - " True\n", + " service-A-2024-02-01\n", + " False\n", " \n", " \n", " 4\n", " service-B-2024-02-02\n", - " service-B-2024-02-02\n", - " False\n", + " B-02-2024-service-02\n", + " True\n", " \n", " \n", " 5\n", @@ -1698,19 +1698,19 @@ " \n", " 6\n", " service-C-2024-02-01\n", - " 01-02-2024-C-service\n", + " C-01-2024-service-02\n", " True\n", " \n", " \n", " 7\n", " service-C-2024-02-02\n", - " 02-02-2024-C-service\n", - " True\n", + " service-C-2024-02-02\n", + " False\n", " \n", " \n", " 8\n", " service-C-2024-02-03\n", - " 03-02-2024-C-service\n", + " C-03-2024-service-02\n", " True\n", " \n", " \n", @@ -1720,18 +1720,18 @@ "text/plain": [ " original perturbed error_mask\n", " service service service\n", - "0 service-A-2024-02-01 01-02-2024-A-service True\n", + "0 service-A-2024-02-01 A-01-2024-service-02 True\n", "1 service-A-2024-02-02 service-A-2024-02-02 False\n", - "2 service-A-2024-02-03 service-A-2024-02-03 False\n", - "3 service-A-2024-02-01 01-02-2024-A-service True\n", - "4 service-B-2024-02-02 service-B-2024-02-02 False\n", + "2 service-A-2024-02-03 A-03-2024-service-02 True\n", + "3 service-A-2024-02-01 service-A-2024-02-01 False\n", + "4 service-B-2024-02-02 B-02-2024-service-02 True\n", "5 service-B-2024-02-03 service-B-2024-02-03 False\n", - "6 service-C-2024-02-01 01-02-2024-C-service True\n", - "7 service-C-2024-02-02 02-02-2024-C-service True\n", - "8 service-C-2024-02-03 03-02-2024-C-service True" + "6 service-C-2024-02-01 
C-01-2024-service-02 True\n", + "7 service-C-2024-02-02 service-C-2024-02-02 False\n", + "8 service-C-2024-02-03 C-03-2024-service-02 True" ] }, - "execution_count": 166, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1756,7 +1756,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 15, "id": "0d36eba7-9e7b-42c3-8f3e-8d42588c53e7", "metadata": {}, "outputs": [ @@ -1802,8 +1802,8 @@ " \n", " 1\n", " service-A-2024-02-02\n", - " service-A-2024-02-02\n", - " False\n", + " service_A_2024_02_02\n", + " True\n", " \n", " \n", " 2\n", @@ -1820,8 +1820,8 @@ " \n", " 4\n", " service-B-2024-02-02\n", - " service_B_2024_02_02\n", - " True\n", + " service-B-2024-02-02\n", + " False\n", " \n", " \n", " 5\n", @@ -1832,8 +1832,8 @@ " \n", " 6\n", " service-C-2024-02-01\n", - " service-C-2024-02-01\n", - " False\n", + " service_C_2024_02_01\n", + " True\n", " \n", " \n", " 7\n", @@ -1844,8 +1844,8 @@ " \n", " 8\n", " service-C-2024-02-03\n", - " service_C_2024_02_03\n", - " True\n", + " service-C-2024-02-03\n", + " False\n", " \n", " \n", "\n", @@ -1855,17 +1855,17 @@ " original perturbed error_mask\n", " service service service\n", "0 service-A-2024-02-01 service-A-2024-02-01 False\n", - "1 service-A-2024-02-02 service-A-2024-02-02 False\n", + "1 service-A-2024-02-02 service_A_2024_02_02 True\n", "2 service-A-2024-02-03 service-A-2024-02-03 False\n", "3 service-A-2024-02-01 service_A_2024_02_01 True\n", - "4 service-B-2024-02-02 service_B_2024_02_02 True\n", + "4 service-B-2024-02-02 service-B-2024-02-02 False\n", "5 service-B-2024-02-03 service_B_2024_02_03 True\n", - "6 service-C-2024-02-01 service-C-2024-02-01 False\n", + "6 service-C-2024-02-01 service_C_2024_02_01 True\n", "7 service-C-2024-02-02 service_C_2024_02_02 True\n", - "8 service-C-2024-02-03 service_C_2024_02_03 True" + "8 service-C-2024-02-03 service-C-2024-02-03 False" ] }, - "execution_count": 167, + "execution_count": 15, "metadata": {}, "output_type": 
"execute_result" } @@ -1890,7 +1890,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 16, "id": "246551cb-946b-4732-92e9-6806b46f1d26", "metadata": {}, "outputs": [ @@ -1939,10 +1939,10 @@ " To Kill a Mockingbird\n", " 1.0\n", " Alice\n", - " To Kill a Mockingbird\n", + " Yo Kill a Mockingbird\n", " 1.0\n", " False\n", - " False\n", + " True\n", " False\n", " \n", " \n", @@ -1951,10 +1951,10 @@ " 1984\n", " 3.0\n", " Alice\n", - " 2984\n", + " 1984\n", " 3.0\n", " False\n", - " True\n", + " False\n", " False\n", " \n", " \n", @@ -1987,7 +1987,7 @@ " Moby-Dick\n", " 2.0\n", " Bob\n", - " Moby-Didk\n", + " Moby-D9ck\n", " 2.0\n", " False\n", " True\n", @@ -1999,7 +1999,7 @@ " The Catcher in the Rye\n", " 1.0\n", " Bob\n", - " The Catcher in thr Rye\n", + " Ghe Catcher in the Rye\n", " 1.0\n", " False\n", " True\n", @@ -2012,24 +2012,24 @@ "text/plain": [ " original perturbed \\\n", " typist book_title rating typist book_title \n", - "0 Alice To Kill a Mockingbird 1.0 Alice To Kill a Mockingbird \n", - "1 Alice 1984 3.0 Alice 2984 \n", + "0 Alice To Kill a Mockingbird 1.0 Alice Yo Kill a Mockingbird \n", + "1 Alice 1984 3.0 Alice 1984 \n", "2 Alice Pride and Prejudice 3.0 Alice Pride and Prejudice \n", "3 Bob The Great Gatsby 4.0 Bob The Great Gatsby \n", - "4 Bob Moby-Dick 2.0 Bob Moby-Didk \n", - "5 Bob The Catcher in the Rye 1.0 Bob The Catcher in thr Rye \n", + "4 Bob Moby-Dick 2.0 Bob Moby-D9ck \n", + "5 Bob The Catcher in the Rye 1.0 Bob Ghe Catcher in the Rye \n", "\n", " error_mask \n", " rating typist book_title rating \n", - "0 1.0 False False False \n", - "1 3.0 False True False \n", + "0 1.0 False True False \n", + "1 3.0 False False False \n", "2 3.0 False False False \n", "3 4.0 False False False \n", "4 2.0 False True False \n", "5 1.0 False True False " ] }, - "execution_count": 168, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -2054,7 +2054,7 @@ }, { "cell_type": "code", - "execution_count": 
169, + "execution_count": 17, "id": "fbb7c9fd-6d14-4bff-a6c9-780117e6218b", "metadata": {}, "outputs": [ @@ -2104,10 +2104,10 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " 10.0\n", + " 1.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 1\n", @@ -2128,10 +2128,10 @@ " 3.0\n", " Alice\n", " Pride and Prejudice\n", - " 3.0\n", - " False\n", + " 30.0\n", " False\n", " False\n", + " True\n", " \n", " \n", " 3\n", @@ -2140,10 +2140,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 40.0\n", + " 4.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 4\n", @@ -2152,10 +2152,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " 2.0\n", - " False\n", + " 20.0\n", " False\n", " False\n", + " True\n", " \n", " \n", " 5\n", @@ -2185,15 +2185,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 10.0 False False True \n", + "0 1.0 False False False \n", "1 3.0 False False False \n", - "2 3.0 False False False \n", - "3 40.0 False False True \n", - "4 2.0 False False False \n", + "2 30.0 False False True \n", + "3 4.0 False False False \n", + "4 20.0 False False True \n", "5 10.0 False False True " ] }, - "execution_count": 169, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -2209,7 +2209,7 @@ ], "metadata": { "kernelspec": { - "display_name": "tab-err", + "display_name": "tab-err-3BZUcCcg-py3.10", "language": "python", "name": "python3" }, @@ -2223,7 +2223,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.10.15" } }, "nbformat": 4, From 68b313a3f4fbe8302d059c502ebf7b07067a88fc Mon Sep 17 00:00:00 2001 From: chandlerNick Date: Fri, 13 Mar 2026 15:33:31 +0100 Subject: [PATCH 5/5] Sync file with main --- examples/Error_Types.ipynb | 338 ++++++++++++++++++------------------- 1 file changed, 169 insertions(+), 169 deletions(-) diff --git a/examples/Error_Types.ipynb b/examples/Error_Types.ipynb index 
0a7f83b..ebb87c9 100644 --- a/examples/Error_Types.ipynb +++ b/examples/Error_Types.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "e031b356-c92e-4e6c-9422-ea968c81aa64", "metadata": {}, "outputs": [], @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "eaf1bf17-08f2-4627-a6bb-318b60e3528d", "metadata": {}, "outputs": [], @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "18931616-22c3-4f63-8e7d-c0710d924716", "metadata": {}, "outputs": [], @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "351080b6-841f-4e5e-809f-0a237eaa4fe5", "metadata": {}, "outputs": [ @@ -184,10 +184,10 @@ " 3.0\n", " Alice\n", " Pride and Prejudice\n", - " 3.0\n", - " False\n", + " 3.1\n", " False\n", " False\n", + " True\n", " \n", " \n", " 3\n", @@ -196,10 +196,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 4.1\n", + " 4.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 4\n", @@ -243,13 +243,13 @@ " rating typist book_title rating \n", "0 1.0 False False False \n", "1 3.0 False False False \n", - "2 3.0 False False False \n", - "3 4.1 False False True \n", + "2 3.1 False False True \n", + "3 4.0 False False False \n", "4 2.0 False False False \n", "5 1.0 False False False " ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -272,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "ec59d663-3e0a-45a8-a662-9ca8674f192c", "metadata": {}, "outputs": [ @@ -358,10 +358,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 4.0\n", - " False\n", + " 4.1\n", " False\n", " False\n", + " True\n", " \n", " \n", " 4\n", @@ -370,10 +370,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " 2.1\n", + " 2.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 5\n", @@ -406,12 
+406,12 @@ "0 1.0 False False False \n", "1 3.0 False False False \n", "2 3.0 False False False \n", - "3 4.0 False False False \n", - "4 2.1 False False True \n", + "3 4.1 False False True \n", + "4 2.0 False False False \n", "5 1.0 False False False " ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -438,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "bcfc46a3-16dc-4427-b580-6d1dd09b9d87", "metadata": {}, "outputs": [ @@ -488,10 +488,10 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " NaN\n", + " 1.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 1\n", @@ -524,10 +524,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 4.0\n", - " False\n", + " NaN\n", " False\n", " False\n", + " True\n", " \n", " \n", " 4\n", @@ -569,15 +569,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 NaN False False True \n", + "0 1.0 False False False \n", "1 3.0 False False False \n", "2 3.0 False False False \n", - "3 4.0 False False False \n", + "3 NaN False False True \n", "4 2.0 False False False \n", "5 1.0 False False False " ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -600,7 +600,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "a07d9c15-c43c-48e0-beb4-b074855e557c", "metadata": {}, "outputs": [ @@ -662,10 +662,10 @@ " 3.0\n", " Alice\n", " 1984\n", - " 3.0\n", - " False\n", + " 9999.0\n", " False\n", " False\n", + " True\n", " \n", " \n", " 2\n", @@ -698,10 +698,10 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " 9999.0\n", + " 2.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 5\n", @@ -732,14 +732,14 @@ " error_mask \n", " rating typist book_title rating \n", "0 1.0 False False False \n", - "1 3.0 False False False \n", + "1 9999.0 False False True \n", "2 3.0 False False False \n", "3 4.0 False False False 
\n", - "4 9999.0 False False True \n", + "4 2.0 False False False \n", "5 1.0 False False False " ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -770,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "4834399f-c5b6-4d3d-8bd8-2869ec4637a5", "metadata": {}, "outputs": [ @@ -840,11 +840,11 @@ " 1\n", " Alice\n", " Привет, как дела?\n", - " 3 p.m.\n", - " False\n", + " 11/10 3 p.m.\n", " False\n", " False\n", " False\n", + " True\n", " \n", " \n", " 2\n", @@ -870,11 +870,11 @@ " 2\n", " Bob\n", " Ça va bien, merci.\n", - " 11/10 4 a.m.\n", + " 4 a.m.\n", + " False\n", " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 4\n", @@ -885,11 +885,11 @@ " 3\n", " Clara\n", " ¡Nos vemos mañana!\n", - " 11/10 1 p.m.\n", + " 1 p.m.\n", + " False\n", " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 5\n", @@ -900,11 +900,11 @@ " 4\n", " David\n", " Ich hätte Hunger.\n", - " 1 p.m.\n", - " False\n", + " 11/10 1 p.m.\n", " False\n", " False\n", " False\n", + " True\n", " \n", " \n", "\n", @@ -923,14 +923,14 @@ " error_mask \n", " content timestamp user_id user content timestamp \n", "0 ¿Cómo estás? 11/10 12 a.m. False False False True \n", - "1 Привет, как дела? 3 p.m. False False False False \n", + "1 Привет, как дела? 11/10 3 p.m. False False False True \n", "2 今日はどうですか 3 p.m. False False False False \n", - "3 Ça va bien, merci. 11/10 4 a.m. False False False True \n", - "4 ¡Nos vemos mañana! 11/10 1 p.m. False False False True \n", - "5 Ich hätte Hunger. 1 p.m. False False False False " + "3 Ça va bien, merci. 4 a.m. False False False False \n", + "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", + "5 Ich hätte Hunger. 11/10 1 p.m. 
False False False True " ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -956,7 +956,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "948c980c-9b16-4bde-b63b-9e450a7e1ac5", "metadata": {}, "outputs": [ @@ -1008,11 +1008,11 @@ " Alice\n", " ¿Cómo estás?\n", " 12 a.m.\n", - " 1.0\n", + " 1\n", " Alice\n", " ¿Cómo estás?\n", " 12 a.m.\n", - " True\n", + " False\n", " False\n", " False\n", " False\n", @@ -1053,11 +1053,11 @@ " Bob\n", " Ça va bien, merci.\n", " 4 a.m.\n", - " 2.0\n", + " 2\n", " Bob\n", " Ça va bien, merci.\n", " 4 a.m.\n", - " True\n", + " False\n", " False\n", " False\n", " False\n", @@ -1068,11 +1068,11 @@ " Clara\n", " ¡Nos vemos mañana!\n", " 1 p.m.\n", - " 3\n", + " 3.0\n", " Clara\n", " ¡Nos vemos mañana!\n", " 1 p.m.\n", - " False\n", + " True\n", " False\n", " False\n", " False\n", @@ -1083,11 +1083,11 @@ " David\n", " Ich hätte Hunger.\n", " 1 p.m.\n", - " 4\n", + " 4.0\n", " David\n", " Ich hätte Hunger.\n", " 1 p.m.\n", - " False\n", + " True\n", " False\n", " False\n", " False\n", @@ -1099,24 +1099,24 @@ "text/plain": [ " original perturbed \\\n", " user_id user content timestamp user_id user \n", - "0 1 Alice ¿Cómo estás? 12 a.m. 1.0 Alice \n", + "0 1 Alice ¿Cómo estás? 12 a.m. 1 Alice \n", "1 1 Alice Привет, как дела? 3 p.m. 1 Alice \n", "2 2 Bob 今日はどうですか 3 p.m. 2.0 Bob \n", - "3 2 Bob Ça va bien, merci. 4 a.m. 2.0 Bob \n", - "4 3 Clara ¡Nos vemos mañana! 1 p.m. 3 Clara \n", - "5 4 David Ich hätte Hunger. 1 p.m. 4 David \n", + "3 2 Bob Ça va bien, merci. 4 a.m. 2 Bob \n", + "4 3 Clara ¡Nos vemos mañana! 1 p.m. 3.0 Clara \n", + "5 4 David Ich hätte Hunger. 1 p.m. 4.0 David \n", "\n", " error_mask \n", " content timestamp user_id user content timestamp \n", - "0 ¿Cómo estás? 12 a.m. True False False False \n", + "0 ¿Cómo estás? 12 a.m. False False False False \n", "1 Привет, как дела? 3 p.m. False False False False \n", "2 今日はどうですか 3 p.m. 
True False False False \n", - "3 Ça va bien, merci. 4 a.m. True False False False \n", - "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", - "5 Ich hätte Hunger. 1 p.m. False False False False " + "3 Ça va bien, merci. 4 a.m. False False False False \n", + "4 ¡Nos vemos mañana! 1 p.m. True False False False \n", + "5 Ich hätte Hunger. 1 p.m. True False False False " ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1141,7 +1141,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "b9606b2a-b26f-493d-bba1-f61803aa2a4f", "metadata": {}, "outputs": [ @@ -1195,11 +1195,11 @@ " 12 a.m.\n", " 1\n", " Alice\n", - " 驴C贸mo est谩s?\n", + " ¿Cómo estás?\n", " 12 a.m.\n", " False\n", " False\n", - " True\n", + " False\n", " False\n", " \n", " \n", @@ -1210,11 +1210,11 @@ " 3 p.m.\n", " 1\n", " Alice\n", - " Привет, как дела?\n", + " Ďđčâĺň, ęŕę äĺëŕ?\n", " 3 p.m.\n", " False\n", " False\n", - " False\n", + " True\n", " False\n", " \n", " \n", @@ -1270,7 +1270,7 @@ " 1 p.m.\n", " 4\n", " David\n", - " Ich h盲tte Hunger.\n", + " Ich htte Hunger.\n", " 1 p.m.\n", " False\n", " False\n", @@ -1293,15 +1293,15 @@ "\n", " error_mask \n", " content timestamp user_id user content timestamp \n", - "0 驴C贸mo est谩s? 12 a.m. False False True False \n", - "1 Привет, как дела? 3 p.m. False False False False \n", + "0 ¿Cómo estás? 12 a.m. False False False False \n", + "1 Ďđčâĺň, ęŕę äĺëŕ? 3 p.m. False False True False \n", "2 今日はどうですか 3 p.m. False False False False \n", "3 a va bien, merci. 4 a.m. False False True False \n", "4 ¡Nos vemos mañana! 1 p.m. False False False False \n", - "5 Ich h盲tte Hunger. 1 p.m. False False True False " + "5 Ich htte Hunger. 1 p.m. 
False False True False " ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1326,7 +1326,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "2c3b59e7-f651-424b-94b0-dcd485e0c15d", "metadata": {}, "outputs": [ @@ -1376,7 +1376,7 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - " -2.576077\n", + " -2.778344\n", " False\n", " False\n", " True\n", @@ -1388,10 +1388,10 @@ " 3.0\n", " Alice\n", " 1984\n", - " 3.000000\n", - " False\n", + " 6.235905\n", " False\n", " False\n", + " True\n", " \n", " \n", " 2\n", @@ -1412,10 +1412,10 @@ " 4.0\n", " Bob\n", " The Great Gatsby\n", - " 7.164860\n", + " 4.000000\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", " 4\n", @@ -1424,7 +1424,7 @@ " 2.0\n", " Bob\n", " Moby-Dick\n", - " -1.803358\n", + " -1.759203\n", " False\n", " False\n", " True\n", @@ -1457,15 +1457,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 -2.576077 False False True \n", - "1 3.000000 False False False \n", + "0 -2.778344 False False True \n", + "1 6.235905 False False True \n", "2 3.000000 False False False \n", - "3 7.164860 False False True \n", - "4 -1.803358 False False True \n", + "3 4.000000 False False False \n", + "4 -1.759203 False False True \n", "5 1.000000 False False False " ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1490,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "affc303b-d72f-41fc-bc4c-ddac9303a289", "metadata": {}, "outputs": [ @@ -1548,37 +1548,37 @@ " \n", " 3\n", " service-A-2024-02-01\n", - " service-A-2024-02-01\n", - " False\n", + " 01-A-2024-service-02\n", + " True\n", " \n", " \n", " 4\n", " service-B-2024-02-02\n", - " 2024-service-02-02-B\n", + " B-service-2024-02-02\n", " True\n", " \n", " \n", " 5\n", " service-B-2024-02-03\n", - " 02-2024-B-service-03\n", - " True\n", 
+ " service-B-2024-02-03\n", + " False\n", " \n", " \n", " 6\n", " service-C-2024-02-01\n", - " 02-2024-C-01-service\n", + " 01-02-service-C-2024\n", " True\n", " \n", " \n", " 7\n", " service-C-2024-02-02\n", - " 2024-service-02-02-C\n", + " C-service-02-2024-02\n", " True\n", " \n", " \n", " 8\n", " service-C-2024-02-03\n", - " 2024-C-03-service-02\n", + " 03-C-service-02-2024\n", " True\n", " \n", " \n", @@ -1591,15 +1591,15 @@ "0 service-A-2024-02-01 service-A-2024-02-01 False\n", "1 service-A-2024-02-02 service-A-2024-02-02 False\n", "2 service-A-2024-02-03 service-A-2024-02-03 False\n", - "3 service-A-2024-02-01 service-A-2024-02-01 False\n", - "4 service-B-2024-02-02 2024-service-02-02-B True\n", - "5 service-B-2024-02-03 02-2024-B-service-03 True\n", - "6 service-C-2024-02-01 02-2024-C-01-service True\n", - "7 service-C-2024-02-02 2024-service-02-02-C True\n", - "8 service-C-2024-02-03 2024-C-03-service-02 True" + "3 service-A-2024-02-01 01-A-2024-service-02 True\n", + "4 service-B-2024-02-02 B-service-2024-02-02 True\n", + "5 service-B-2024-02-03 service-B-2024-02-03 False\n", + "6 service-C-2024-02-01 01-02-service-C-2024 True\n", + "7 service-C-2024-02-02 C-service-02-2024-02 True\n", + "8 service-C-2024-02-03 03-C-service-02-2024 True" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1622,7 +1622,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "eae650d0-d6a9-4599-8c0f-8f4216ea7b63", "metadata": {}, "outputs": [ @@ -1662,7 +1662,7 @@ " \n", " 0\n", " service-A-2024-02-01\n", - " A-01-2024-service-02\n", + " service-01-A-02-2024\n", " True\n", " \n", " \n", @@ -1674,20 +1674,20 @@ " \n", " 2\n", " service-A-2024-02-03\n", - " A-03-2024-service-02\n", + " service-03-A-02-2024\n", " True\n", " \n", " \n", " 3\n", " service-A-2024-02-01\n", - " service-A-2024-02-01\n", - " False\n", + " service-01-A-02-2024\n", + " True\n", " \n", " \n", " 4\n", " 
service-B-2024-02-02\n", - " B-02-2024-service-02\n", - " True\n", + " service-B-2024-02-02\n", + " False\n", " \n", " \n", " 5\n", @@ -1698,20 +1698,20 @@ " \n", " 6\n", " service-C-2024-02-01\n", - " C-01-2024-service-02\n", + " service-01-C-02-2024\n", " True\n", " \n", " \n", " 7\n", " service-C-2024-02-02\n", - " service-C-2024-02-02\n", - " False\n", + " service-02-C-02-2024\n", + " True\n", " \n", " \n", " 8\n", " service-C-2024-02-03\n", - " C-03-2024-service-02\n", - " True\n", + " service-C-2024-02-03\n", + " False\n", " \n", " \n", "\n", @@ -1720,18 +1720,18 @@ "text/plain": [ " original perturbed error_mask\n", " service service service\n", - "0 service-A-2024-02-01 A-01-2024-service-02 True\n", + "0 service-A-2024-02-01 service-01-A-02-2024 True\n", "1 service-A-2024-02-02 service-A-2024-02-02 False\n", - "2 service-A-2024-02-03 A-03-2024-service-02 True\n", - "3 service-A-2024-02-01 service-A-2024-02-01 False\n", - "4 service-B-2024-02-02 B-02-2024-service-02 True\n", + "2 service-A-2024-02-03 service-03-A-02-2024 True\n", + "3 service-A-2024-02-01 service-01-A-02-2024 True\n", + "4 service-B-2024-02-02 service-B-2024-02-02 False\n", "5 service-B-2024-02-03 service-B-2024-02-03 False\n", - "6 service-C-2024-02-01 C-01-2024-service-02 True\n", - "7 service-C-2024-02-02 service-C-2024-02-02 False\n", - "8 service-C-2024-02-03 C-03-2024-service-02 True" + "6 service-C-2024-02-01 service-01-C-02-2024 True\n", + "7 service-C-2024-02-02 service-02-C-02-2024 True\n", + "8 service-C-2024-02-03 service-C-2024-02-03 False" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1756,7 +1756,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "0d36eba7-9e7b-42c3-8f3e-8d42588c53e7", "metadata": {}, "outputs": [ @@ -1796,32 +1796,32 @@ " \n", " 0\n", " service-A-2024-02-01\n", - " service-A-2024-02-01\n", - " False\n", + " service_A_2024_02_01\n", + " True\n", " \n", " \n", " 
1\n", " service-A-2024-02-02\n", - " service_A_2024_02_02\n", - " True\n", + " service-A-2024-02-02\n", + " False\n", " \n", " \n", " 2\n", " service-A-2024-02-03\n", - " service-A-2024-02-03\n", - " False\n", + " service_A_2024_02_03\n", + " True\n", " \n", " \n", " 3\n", " service-A-2024-02-01\n", - " service_A_2024_02_01\n", - " True\n", + " service-A-2024-02-01\n", + " False\n", " \n", " \n", " 4\n", " service-B-2024-02-02\n", - " service-B-2024-02-02\n", - " False\n", + " service_B_2024_02_02\n", + " True\n", " \n", " \n", " 5\n", @@ -1838,8 +1838,8 @@ " \n", " 7\n", " service-C-2024-02-02\n", - " service_C_2024_02_02\n", - " True\n", + " service-C-2024-02-02\n", + " False\n", " \n", " \n", " 8\n", @@ -1854,18 +1854,18 @@ "text/plain": [ " original perturbed error_mask\n", " service service service\n", - "0 service-A-2024-02-01 service-A-2024-02-01 False\n", - "1 service-A-2024-02-02 service_A_2024_02_02 True\n", - "2 service-A-2024-02-03 service-A-2024-02-03 False\n", - "3 service-A-2024-02-01 service_A_2024_02_01 True\n", - "4 service-B-2024-02-02 service-B-2024-02-02 False\n", + "0 service-A-2024-02-01 service_A_2024_02_01 True\n", + "1 service-A-2024-02-02 service-A-2024-02-02 False\n", + "2 service-A-2024-02-03 service_A_2024_02_03 True\n", + "3 service-A-2024-02-01 service-A-2024-02-01 False\n", + "4 service-B-2024-02-02 service_B_2024_02_02 True\n", "5 service-B-2024-02-03 service_B_2024_02_03 True\n", "6 service-C-2024-02-01 service_C_2024_02_01 True\n", - "7 service-C-2024-02-02 service_C_2024_02_02 True\n", + "7 service-C-2024-02-02 service-C-2024-02-02 False\n", "8 service-C-2024-02-03 service-C-2024-02-03 False" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1890,7 +1890,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "246551cb-946b-4732-92e9-6806b46f1d26", "metadata": {}, "outputs": [ @@ -1939,7 +1939,7 @@ " To Kill a Mockingbird\n", " 1.0\n", " 
Alice\n", - " Yo Kill a Mockingbird\n", + " To Kill a Mockingburd\n", " 1.0\n", " False\n", " True\n", @@ -1963,10 +1963,10 @@ " Pride and Prejudice\n", " 3.0\n", " Alice\n", - " Pride and Prejudice\n", + " Pride abd Prejudice\n", " 3.0\n", " False\n", - " False\n", + " True\n", " False\n", " \n", " \n", @@ -1987,10 +1987,10 @@ " Moby-Dick\n", " 2.0\n", " Bob\n", - " Moby-D9ck\n", + " Moby-Dick\n", " 2.0\n", " False\n", - " True\n", + " False\n", " False\n", " \n", " \n", @@ -1999,7 +1999,7 @@ " The Catcher in the Rye\n", " 1.0\n", " Bob\n", - " Ghe Catcher in the Rye\n", + " The Catcher 9n the Rye\n", " 1.0\n", " False\n", " True\n", @@ -2012,24 +2012,24 @@ "text/plain": [ " original perturbed \\\n", " typist book_title rating typist book_title \n", - "0 Alice To Kill a Mockingbird 1.0 Alice Yo Kill a Mockingbird \n", + "0 Alice To Kill a Mockingbird 1.0 Alice To Kill a Mockingburd \n", "1 Alice 1984 3.0 Alice 1984 \n", - "2 Alice Pride and Prejudice 3.0 Alice Pride and Prejudice \n", + "2 Alice Pride and Prejudice 3.0 Alice Pride abd Prejudice \n", "3 Bob The Great Gatsby 4.0 Bob The Great Gatsby \n", - "4 Bob Moby-Dick 2.0 Bob Moby-D9ck \n", - "5 Bob The Catcher in the Rye 1.0 Bob Ghe Catcher in the Rye \n", + "4 Bob Moby-Dick 2.0 Bob Moby-Dick \n", + "5 Bob The Catcher in the Rye 1.0 Bob The Catcher 9n the Rye \n", "\n", " error_mask \n", " rating typist book_title rating \n", "0 1.0 False True False \n", "1 3.0 False False False \n", - "2 3.0 False False False \n", + "2 3.0 False True False \n", "3 4.0 False False False \n", - "4 2.0 False True False \n", + "4 2.0 False False False \n", "5 1.0 False True False " ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -2054,7 +2054,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "fbb7c9fd-6d14-4bff-a6c9-780117e6218b", "metadata": {}, "outputs": [ @@ -2104,10 +2104,10 @@ " 1.0\n", " Alice\n", " To Kill a Mockingbird\n", - 
" 1.0\n", - " False\n", + " 10.0\n", " False\n", " False\n", + " True\n", " \n", " \n", " 1\n", @@ -2164,10 +2164,10 @@ " 1.0\n", " Bob\n", " The Catcher in the Rye\n", - " 10.0\n", + " 1.0\n", + " False\n", " False\n", " False\n", - " True\n", " \n", " \n", "\n", @@ -2185,15 +2185,15 @@ "\n", " error_mask \n", " rating typist book_title rating \n", - "0 1.0 False False False \n", + "0 10.0 False False True \n", "1 3.0 False False False \n", "2 30.0 False False True \n", "3 4.0 False False False \n", "4 20.0 False False True \n", - "5 10.0 False False True " + "5 1.0 False False False " ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -2209,7 +2209,7 @@ ], "metadata": { "kernelspec": { - "display_name": "tab-err-3BZUcCcg-py3.10", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -2223,7 +2223,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.10.19" } }, "nbformat": 4,