Hello Everyone,

In this PySpark interview question, we take daily sales rows and compute, for each year, the percentage difference between Q1 and Q2 total sales.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, quarter, to_date, year, sum as _sum, when

# Create the SparkSession used by the script below
spark = SparkSession.builder.appName("quarterly_sales").getOrCreate()
# Define the sales data
data = [
("2010-01-02", 500),
("2010-02-03", 1000),
("2010-03-04", 1000),
("2010-04-05", 1000),
("2010-05-06", 1500),
("2010-06-07", 1000),
("2010-07-08", 1000),
("2010-08-09", 1000),
("2011-10-10", 1000),
("2011-01-02", 500),
("2011-02-03", 1000),
("2011-03-04", 1000),
("2011-04-05", 1000),
("2011-05-06", 1550),
("2011-06-07", 1100),
("2011-07-08", 1100),
("2011-08-09", 1000),
]
# Define the schema for the sales data
schema = ["date", "sales"]
# Create the DataFrame
df = spark.createDataFrame(data, schema)
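Passing a plain list of column names works because Spark infers the types (string for date, long for sales). If an interviewer asks for explicit typing, you could build the schema yourself; here is a minimal sketch of that variant (explicit_schema is my own name):

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

explicit_schema = StructType([
    StructField("date", StringType(), True),
    StructField("sales", IntegerType(), True),
])
df = spark.createDataFrame(data, explicit_schema)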
# Extract year and quarter from the date
df_with_quarters = (
    df.withColumn("date", to_date(col("date")))  # cast the yyyy-MM-dd string to DateType
    .withColumn("year", year(col("date")))
    .withColumn("quarter", quarter(col("date")))
)
df_with_quarters.show()
# Aggregate sales data by year and quarter
quarterly_sales = (
df_with_quarters.groupBy("year", "quarter")
.agg(_sum("sales").alias("total_sales"))
)
quarterly_sales.show()
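For the sample data above, the aggregation works out to the totals below (computed by hand from the rows; show() may print them in a different order):

# +----+-------+-----------+
# |year|quarter|total_sales|
# +----+-------+-----------+
# |2010|      1|       2500|
# |2010|      2|       3500|
# |2010|      3|       2000|
# |2011|      1|       2500|
# |2011|      2|       3650|
# |2011|      3|       2100|
# |2011|      4|       1000|
# +----+-------+-----------+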
# Pivot to turn Q1 and Q2 into columns; listing the pivot values ([1, 2]) up front spares Spark an extra pass to discover them
pivoted_sales = (
quarterly_sales.groupBy("year")
.pivot("quarter", [1, 2])
.agg(_sum("total_sales"))
.withColumnRenamed("1", "Q1_sales")
.withColumnRenamed("2", "Q2_sales")
)
pivoted_sales.show()
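The pivot should leave one row per year with the two quarters side by side (again computed by hand; row order may vary):

# +----+--------+--------+
# |year|Q1_sales|Q2_sales|
# +----+--------+--------+
# |2010|    2500|    3500|
# |2011|    2500|    3650|
# +----+--------+--------+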
# Calculate the percentage difference between Q1 and Q2
result = (
pivoted_sales.withColumn(
"percentage_difference",
when(
(col("Q1_sales").isNotNull()) & (col("Q2_sales").isNotNull()),
((col("Q2_sales") - col("Q1_sales")) / col("Q1_sales")) * 100,
    ).otherwise(None),  # years missing either quarter get null (when() without otherwise defaults to null anyway)
)
)
# Show the result
result.show()
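For this data both quarters are present in both years, so the expected figures are (3500 - 2500) / 2500 * 100 = 40.0 for 2010 and (3650 - 2500) / 2500 * 100 = 46.0 for 2011.

A follow-up interviewers sometimes ask: get the same number without pivoting, using a window function. This is a minimal sketch (names like w and alt are my own); lag() pulls the previous quarter's total within each year, and filtering to quarter 2 leaves exactly the Q2-vs-Q1 comparison:

from pyspark.sql.functions import lag
from pyspark.sql.window import Window

# Order quarters within each year so lag() sees Q1 just before Q2
w = Window.partitionBy("year").orderBy("quarter")
alt = (
    quarterly_sales
    .withColumn("prev_sales", lag("total_sales").over(w))
    .withColumn("percentage_difference",
                (col("total_sales") - col("prev_sales")) / col("prev_sales") * 100)
    .filter(col("quarter") == 2)
)
alt.show()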
This series is for beginner and intermediate-level candidates who want to crack PySpark interviews.
Here is the link to the course: https://www.geekcoders.co.in/courses/...
#pyspark #interviewquestions #interview #pysparkinterview #dataengineer #aws #databricks #python