# Session 54 - Full Outer Join in PySpark - Joining over one Column
# Sample employee rows: (emp_id, emp_name, dept_id).
# Note: Person5 points at dept_id 6, which has no matching department,
# and department 5 ("FE") has no employees -- both gaps are deliberate
# so the full outer join produces NULLs on each side.
# (The original defined both lists twice verbatim; once is enough.)
emp_data = [
    (1, "Person1", 1),
    (2, "Person2", 2),
    (3, "Person3", 1),
    (4, "Person4", 1),
    (5, "Person5", 6),
    (6, "Person6", 4),
    (7, "Person6", 2),
    (8, "Person8", 3),
]

# Sample department rows: (department_id, department_name).
department_data = [
    (1, "IT"),
    (2, "HR"),
    (3, "DE"),
    (4, "BE"),
    (5, "FE"),
]
# Explicit imports instead of the original wildcard `import *`
# (PEP 8: never `from x import *` in code).
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import functions as F

# Schema for emp_data rows: (emp_id, emp_name, dept_id).
emp_schema = StructType([
    StructField("emp_id", IntegerType()),
    StructField("emp_name", StringType()),
    StructField("dept_id", IntegerType()),
])

# Schema for department_data rows: (department_id, department_name).
dept_schema = StructType([
    StructField("department_id", IntegerType()),
    StructField("department_name", StringType()),
])
# Build both DataFrames from the in-memory rows using the explicit schemas.
# `spark` is the active SparkSession (provided by the notebook environment).
emp_df = spark.createDataFrame(emp_data, emp_schema)
dept_df = spark.createDataFrame(department_data, dept_schema)

emp_df.display()
dept_df.display()

# The join predicate is the same for every variant, so build it once.
join_condition = emp_df.dept_id == dept_df.department_id

# "full", "fullouter" and "full_outer" are interchangeable aliases for the
# same full outer join -- all three results below are identical.
full_outer_join_df = emp_df.join(dept_df, join_condition, "full")
full_outer_join_df.display()

full_outer_join_df1 = emp_df.join(dept_df, join_condition, "fullouter")
full_outer_join_df1.display()

full_outer_join_df2 = emp_df.join(dept_df, join_condition, "full_outer")
full_outer_join_df2.display()
# Register temp views so the same join can also be expressed in Spark SQL.
emp_df.createOrReplaceTempView("employee")
dept_df.createOrReplaceTempView("department")

# FULL JOIN and FULL OUTER JOIN are synonyms in Spark SQL; run both to show
# they produce the same result.
for query in (
    "SELECT * FROM employee FULL JOIN department ON employee.dept_id=department.department_id",
    "SELECT * FROM employee FULL OUTER JOIN department ON employee.dept_id=department.department_id",
):
    spark.sql(query).display()
#pyspark #apachespark #databricks #coding #learnpyspark #python #azuredatabrickswithpyspark #vlog #viralvideo
# (Scraped page boilerplate, translated from Russian: "Comment information is under development.")