Spark with HDFS
Example
- Check the configuration file (core-site.xml) to get the HDFS address.
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://master:9000</value>
  </property>
  ...
</configuration>
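You can also confirm the effective address from the command line; hdfs getconf reads the resolved configuration (assuming the Hadoop client is on your PATH):
hdfs getconf -confKey fs.defaultFS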
- Access HDFS in PySpark. Launch the PySpark shell, which provides a ready-made SparkContext bound to the name sc:
pyspark
Read from HDFS:
lines = sc.textFile("hdfs://master:9000/user/input/sample.txt")
lines.count()
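Outside the interactive shell, the same read can run as a standalone script submitted with spark-submit. This is a minimal sketch assuming the same hdfs://master:9000 address; the script name read_hdfs.py is just an example:
# read_hdfs.py -- standalone version of the read above
from pyspark import SparkContext

sc = SparkContext(appName="hdfs-read-example")
lines = sc.textFile("hdfs://master:9000/user/input/sample.txt")
print(lines.count())  # number of lines in sample.txt
sc.stop()
Run it with: spark-submit read_hdfs.py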
If HDFS is still in safe mode, writes will fail; leave safe mode first (hdfs dfsadmin is the current form of the deprecated hadoop dfsadmin):
hdfs dfsadmin -safemode leave
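To check whether the NameNode is actually in safe mode before forcing it off:
hdfs dfsadmin -safemode get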
Write to HDFS:
rowdata = sc.parallelize([[1, 2], [3, 4]])
rowdata.map(lambda row: row[0] + row[1]) \
    .saveAsTextFile("hdfs://master:9000/user/input/example")
Check the result:
hadoop fs -cat hdfs://master:9000/user/input/example/part-00000
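Note that saveAsTextFile writes a directory, not a single file: one part-NNNNN file per partition of the RDD, plus a _SUCCESS marker, so the number of part files depends on the parallelism of the source RDD. The save also fails if the target directory already exists, so remove or rename it before re-running. Listing the directory shows what was written:
hadoop fs -ls hdfs://master:9000/user/input/example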