#1. Launch the shell
spark-shell
#2. The shell creates a SparkContext and assigns it to the variable sc
scala> sc
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@2981064d
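Assuming a Spark 2.x or later shell, a SparkSession named spark is also pre-created, and sc is simply its underlying context; this can be checked with a line like the following (not part of the recorded session):
spark.sparkContext eq sc   // should print true: sc is just spark's underlying SparkContext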
#3. Load the file and assign it to the variable rawRdd
scala> val rawRdd = sc.textFile("/data/tmp/word.txt")
rawRdd: org.apache.spark.rdd.RDD[String] = /data/tmp/word.txt MapPartitionsRDD[1] at textFile at <console>:24
#4. The first action: view the first line of data
scala> rawRdd.first
res2: String = bout yun
#5. The collect action: view the contents of rawRdd. If the dataset is large, do not use collect; pulling everything to the driver can exhaust its memory.
scala> rawRdd.collect
res3: Array[String] = Array(bout yun, hello word, hello spark)
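When the file might be large, a safer habit is to bring only a bounded sample or an aggregate back to the driver, for example:
rawRdd.take(2)    // only the first 2 lines are sent to the driver
rawRdd.count()    // just the line count; the data itself stays on the executors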
#6. View the lineage with toDebugString
scala> rawRdd.toDebugString
res6: String =
(2) /data/tmp/word.txt MapPartitionsRDD[1] at textFile at <console>:24 []
| /data/tmp/word.txt HadoopRDD[0] at textFile at <console>:24 []
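The leading (2) in toDebugString is the RDD's partition count; as a quick sanity check (not part of the original session) it can be read directly:
rawRdd.getNumPartitions   // expected to match the (2) shown above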
#7. Transformation (flatMap)
scala> val words = rawRdd.flatMap(line => line.split("\\s+"))
words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at flatMap at <console>:25
scala> words.toDebugString
res8: String =
(2) MapPartitionsRDD[2] at flatMap at <console>:25 []
| /data/tmp/word.txt MapPartitionsRDD[1] at textFile at <console>:24 []
| /data/tmp/word.txt HadoopRDD[0] at textFile at <console>:24 []
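For contrast, a plain map would keep one array per line instead of flattening into individual words; this sketch uses the illustrative name arrays and is not from the session:
val arrays = rawRdd.map(line => line.split("\\s+"))   // RDD[Array[String]]: one array per input line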
#8. View the split words, then map each to a (word, 1) pair
scala> words.collect
res9: Array[String] = Array(bout, yun, hello, word, hello, spark)
scala> val words_one = words.map(word=>(word,1))
words_one: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[3] at map at <console>:25
scala> words_one.collect
res10: Array[(String, Int)] = Array((bout,1), (yun,1), (hello,1), (word,1), (hello,1), (spark,1))
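The same mapping is often written with the underscore shorthand; words_one_alt below is an illustrative name, not from the session:
val words_one_alt = words.map((_, 1))   // equivalent to word => (word, 1)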
#9. Count by key (collect is the action that triggers the job)
scala> words_one.reduceByKey(_+_).collect
res12: Array[(String, Int)] = Array((yun,1), (bout,1), (word,1), (hello,2), (spark,1))
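reduceByKey itself is a transformation; the trailing collect is what runs the job. To view the counts in descending order, the result can be sorted before collecting (illustrative, not from the recorded session):
words_one.reduceByKey(_ + _).sortBy(_._2, ascending = false).collect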
#10. Lineage of the final result
scala> words_one.reduceByKey(_+_).toDebugString
res13: String =
(2) ShuffledRDD[6] at reduceByKey at <console>:26 []
+-(2) MapPartitionsRDD[3] at map at <console>:25 []
| MapPartitionsRDD[2] at flatMap at <console>:25 []
| /data/tmp/word.txt MapPartitionsRDD[1] at textFile at <console>:24 []
| /data/tmp/word.txt HadoopRDD[0] at textFile at <console>:24 []
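Putting the steps together, the whole word count can be expressed as one chain and written back out; the output path below is hypothetical:
sc.textFile("/data/tmp/word.txt")
  .flatMap(_.split("\\s+"))
  .map((_, 1))
  .reduceByKey(_ + _)
  .saveAsTextFile("/data/tmp/word_count_out")   // hypothetical output directory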