Hello
I'm trying to replicate your example in my own project.
But I have an issue with python udf: always run into this error ModuleNotFoundError: No module named 'pipelines'
I simply changed your code as is:
amazon.py:
# Example ETL with no parameters - see etl() function
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, current_timestamp
from pipelines.utils import transformations, configmanagement as cm
from pyspark.sql.types import StructType, StructField, DoubleType, StringType
spark = SparkSession.builder.getOrCreate()
def extract_Amazon(filePath):
return spark.read.format("parquet").load(filePath)
def transform_Amazon(df):
df = df.withColumn("meta_timestamp", lit(current_timestamp()))
df = transformations.addDummyColumn(df)
return df
def load_Amazon(df):
spark.sql("DROP TABLE IF EXISTS amazon")
df.write.format("parquet").mode("overwrite").saveAsTable("amazon")
return
def addone(df):
df['price'] = df['price'] + 1
return df
def etl():
df = extract_Amazon("/databricks-datasets/amazon/test4K")
df = transform_Amazon(df)
schema = StructType([
StructField("brand", StringType(), True),
StructField("price", DoubleType(), True),
])
df = df.select('brand', 'price').groupBy("brand").applyInPandas(transformations.addone, schema)
print(df.show())
# load_Amazon(df)
and it gives me this error:
pyspark.sql.utils.PythonException: An exception was thrown from a UDF: 'pyspark.serializers.SerializationError: Caused by Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 165, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 469, in loads
return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'pipelines''. Full traceback below:
Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 165, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 469, in loads
return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'pipelines'
During handling of the above exception, another exception occurred:
pyspark.serializers.SerializationError: Caused by Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 165, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 469, in loads
return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'pipelines'
Any idea how to solve this ?
Hello
I'm trying to replicate your example in my own project.
But I have an issue with python udf: always run into this error
ModuleNotFoundError: No module named 'pipelines'I simply changed your code as is:
amazon.py:
and it gives me this error:
Any idea how to solve this ?