I'll create a demo DataFrame to reproduce the error that I see in Databricks.
from pyspark.sql.types import StructType, StructField, TimestampType, StringType
from datetime import datetime
# Define the schema
schema = StructType([
    StructField("session_ts", TimestampType(), True),
    StructField("analysis_ts", TimestampType(), True)
])
# Define the data with datetime objects
data = [
    (datetime(2023, 9, 15, 17, 30, 41), datetime(2023, 9, 15, 17, 47, 3)),
    (datetime(2023, 10, 24, 18, 23, 37), datetime(2023, 10, 24, 18, 25, 16)),
    (datetime(2024, 1, 15, 6, 38, 52), datetime(2024, 1, 15, 6, 48, 15)),
    (datetime(2024, 2, 21, 13, 16, 37), datetime(2024, 2, 21, 13, 22, 35)),
    (datetime(2023, 10, 18, 17, 52, 28), datetime(2023, 10, 19, 17, 11, 3))
]
# Create a DataFrame
df = spark.createDataFrame(data, schema=schema)
When I try to convert the PySpark DataFrame to pandas I get the error: TypeError: Casting to unit-less dtype 'datetime64' is not supported. Pass e.g. 'datetime64[ns]' instead.
df.toPandas().head()
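For reference, this seems to be pandas behavior rather than anything specific to my data: since pandas 2.0, astype with the unit-less 'datetime64' dtype is rejected outright. A minimal pure-pandas sketch (assuming pandas 2.x) reproduces the same TypeError:

import pandas as pd
from datetime import datetime

# An object-dtype series of datetime values
s = pd.Series([datetime(2023, 9, 15, 17, 30, 41)], dtype=object)

try:
    s.astype("datetime64")  # unit-less dtype: raises under pandas 2.x
except TypeError as e:
    print(e)

print(s.astype("datetime64[ns]").dtype)  # explicit unit works: datetime64[ns]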
Casting the fields to TimestampType did not resolve the error (unsurprisingly, since the schema already declares them as TimestampType):
df = df.withColumn("session_ts", df["session_ts"].cast(TimestampType()))
df = df.withColumn("analysis_ts", df["analysis_ts"].cast(TimestampType()))
df.toPandas()
I was only able to proceed by casting to string, which seems an unnecessary workaround.
df = df.withColumn("session_ts", df["session_ts"].cast(StringType()))
df = df.withColumn("analysis_ts", df["analysis_ts"].cast(StringType()))
df.toPandas()
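For completeness, once the columns come across as strings they can be parsed back into datetimes on the pandas side; a short sketch of that follow-up step:

import pandas as pd

pdf = df.toPandas()
# Parse the stringified columns back into proper pandas datetimes
pdf["session_ts"] = pd.to_datetime(pdf["session_ts"])
pdf["analysis_ts"] = pd.to_datetime(pdf["analysis_ts"])
print(pdf.dtypes)  # both columns should now be datetime64[ns]

This works, but it underlines how roundabout the string detour is.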
1 Answer
1) Ensure datetime64[ns] During Conversion
import pyspark.sql.functions as F

# Explicitly cast timestamps to ensure compatibility
df = df.withColumn("session_ts", F.col("session_ts").cast("timestamp"))
df = df.withColumn("analysis_ts", F.col("analysis_ts").cast("timestamp"))

# Convert to pandas
pdf = df.toPandas()
print(pdf.head())
2) Disable PyArrow for Conversion (Fallback to Legacy Conversion)
# Disable PyArrow during the conversion
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

# Convert to pandas
pdf = df.toPandas()
print(pdf.head())
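If neither option helps, this error usually comes down to a version mismatch: pandas 2.x removed the unit-less 'datetime64' cast, while older PySpark releases still rely on it inside toPandas(). A quick check of the versions in play (assuming both libraries are importable):

import pandas as pd
import pyspark

# The unit-less cast was removed in pandas 2.0; older PySpark releases
# still use it during toPandas(), so compare the two versions
print("pandas:", pd.__version__)
print("pyspark:", pyspark.__version__)

Upgrading PySpark, or pinning pandas below 2.0, should remove the need for the string workaround.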