package cn.edu360.day8

import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Demonstrates how Spark SQL chooses a join strategy.
 *
 * With `spark.sql.autoBroadcastJoinThreshold` set to -1, the broadcast-join
 * optimization is disabled, so the planner falls back to a shuffle-based
 * join; the resulting physical plan is printed via `explain()`.
 *
 * Created by zx on 2017/10/16.
 */
object JoinTest {

  def main(args: Array[String]): Unit = {
    // Local session using all available cores.
    val spark = SparkSession.builder()
      .appName("CsvDataSource")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._
    //import org.apache.spark.sql.functions._

    // Default broadcast threshold is 10 MB; -1 disables broadcast joins
    // entirely so we can observe the shuffle-based plan instead.
    //spark.sql.autoBroadcastJoinThreshold=-1
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
    //spark.conf.set("spark.sql.join.preferSortMergeJoin", true)
    //println(spark.conf.get("spark.sql.autoBroadcastJoinThreshold"))

    val df1 = Seq(
      (0, "playing"),
      (1, "with"),
      (2, "join")
    ).toDF("id", "token")

    val df2 = Seq(
      (0, "P"),
      (1, "W"),
      (2, "S")
    ).toDF("aid", "atoken")

    // NOTE(review): the original code called `df2.repartition()` and threw
    // the result away. DataFrames are immutable, so a discarded repartition
    // has no effect whatsoever — the dead call has been removed.
    //df1.cache().count()

    // Equi-join on df1.id == df2.aid.
    val result: DataFrame = df1.join(df2, $"id" === $"aid")

    // Print the physical execution plan, then the joined rows.
    result.explain()
    result.show()

    spark.stop()
  }
}
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
Broadcast join 的阈值默认是 10MB;设为 -1 表示完全禁用 broadcast join,也可以把 -1 改成其他阈值。
sort merge join:先对参与 join 的两个表按 join key 排序,然后再进行 join。
shuffle hash join:按 join key 的 hash 值把数据分发到不同分区后再进行 join。(注:现代 Spark SQL 默认优先使用 sort merge join,见 spark.sql.join.preferSortMergeJoin,并非默认 hash join。)