Monday, December 4, 2017
Spark and Jupyter
- Download
- Install
- chmod +x Anaconda2-5.0.1-Linux-x86_64.sh
- sudo bash Anaconda2-5.0.1-Linux-x86_64.sh
- Secure
- source .bashrc
- jupyter notebook --generate-config
- jupyter notebook password
- IP
- vim .jupyter/jupyter_notebook_config.py
- #c.NotebookApp.ip = 'localhost' → c.NotebookApp.ip = '*'
- PySpark
- vim .bashrc
- Add the following contents (a sample .bashrc snippet is sketched after this list)
- sudo su -
- mkdir -p /usr/local/share/jupyter/kernels/pyspark
- chown -R test_user:test_user /usr/local/share/jupyter/
- exit
- vim /usr/local/share/jupyter/kernels/pyspark/kernel.json
- Add the following contents (a sample kernel.json is sketched after this list)
- Run
- jupyter notebook
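
A minimal sketch of typical .bashrc additions for this setup; the Spark and Anaconda locations are hypothetical and need to be adjusted to your install:

# Hypothetical locations: point these at your actual Spark and Anaconda installs
export SPARK_HOME=/path/to/spark
export PATH=$HOME/anaconda2/bin:$SPARK_HOME/bin:$PATH
export PYSPARK_PYTHON=$HOME/anaconda2/bin/python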
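
And a kernel.json sketch in the same spirit for /usr/local/share/jupyter/kernels/pyspark; the python path, Spark path, and py4j version are placeholders to replace with your own:

{
  "display_name": "PySpark",
  "language": "python",
  "argv": ["/path/to/anaconda2/bin/python", "-m", "ipykernel", "-f", "{connection_file}"],
  "env": {
    "SPARK_HOME": "/path/to/spark",
    "PYTHONPATH": "/path/to/spark/python:/path/to/spark/python/lib/py4j-0.10.4-src.zip",
    "PYTHONSTARTUP": "/path/to/spark/python/pyspark/shell.py",
    "PYSPARK_SUBMIT_ARGS": "--master local[*] pyspark-shell"
  }
}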
Tuesday, October 10, 2017
Specify the name of the output file using the key and remove the key from the output file in Spark
- Old hadoop API
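
A sketch of one common way with the old (mapred) API: a MultipleTextOutputFormat that takes the file name from the key and writes only the value. The class name and the RDD layout (String file name key, String line value) are assumptions here.

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat

// Illustrative output format: the key selects the file name, only the value is written
class KeyAsFileNameOutputFormat extends MultipleTextOutputFormat[Any, Any] {
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
    key.asInstanceOf[String]

  // Drop the key from the record that gets written
  override def generateActualKey(key: Any, value: Any): Any =
    NullWritable.get()
}

// rdd: RDD[(String, String)] where the key is the desired file name
rdd.saveAsHadoopFile(outputPath, classOf[String], classOf[String], classOf[KeyAsFileNameOutputFormat])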
- New hadoop API
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.output.{LazyOutputFormat, MultipleOutputs, TextOutputFormat}
import org.apache.hadoop.mapreduce.task.MapContextImpl

class TextMultipleOutputsFormat extends TextOutputFormat[Text, Text] {
  override def getRecordWriter(context: TaskAttemptContext): RecordWriter[Text, Text] =
    new RecordWriter[Text, Text] {
      val job: Job = Job.getInstance(context.getConfiguration)
      // MultipleOutputs needs a map context; give it a dummy record writer
      val moContext = new MapContextImpl(job.getConfiguration, context.getTaskAttemptID,
        null, new DummyRecordWriter, null, null, null)
      val multipleOutputs = new MultipleOutputs[NullWritable, Text](moContext)
      LazyOutputFormat.setOutputFormatClass(job, classOf[TextOutputFormat[_, _]])

      // The key selects the output file name; only the value is written to the file
      override def write(key: Text, value: Text): Unit = {
        multipleOutputs.write(NullWritable.get, value, key.toString)
      }

      override def close(context: TaskAttemptContext): Unit = multipleOutputs.close()
    }

  private class DummyRecordWriter extends RecordWriter[NullWritable, Text] {
    override def write(key: NullWritable, value: Text): Unit = ()
    override def close(context: TaskAttemptContext): Unit = ()
  }
}

// rdd: RDD[(Text, Text)] where the key is the desired file name
rdd.saveAsNewAPIHadoopFile[TextMultipleOutputsFormat](outputPath)

- Spark API
rdd.toDF("file","data").write.partitionBy("file").text(outputPath)
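
A small usage sketch of the DataFrame route (the sample data and outputPath are made up). Note that partitionBy writes one directory per key, e.g. file=2017-10-09/part-..., and the partition column itself is dropped from the text output.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("output-file-per-key").getOrCreate()
import spark.implicits._

val outputPath = "/tmp/output-file-per-key"   // hypothetical path
val rdd = spark.sparkContext.parallelize(Seq(
  ("2017-10-09", "first line"),
  ("2017-10-10", "second line")
))

// Each key becomes a directory under outputPath; only the data column is written
rdd.toDF("file", "data").write.partitionBy("file").text(outputPath)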