diff --git a/terraform/examples/README.md b/terraform/examples/README.md
index 2c6912bbd..0e114eff2 100644
--- a/terraform/examples/README.md
+++ b/terraform/examples/README.md
@@ -3,5 +3,3 @@
 The examples included here are designed to introduce specific features and
 provide a basic learning experience. The examples subdirectory is automatically
 provisioned into the home directory of the VMs in your cloud environment.
-
-- [Spark Integration](spark/README.md)
diff --git a/terraform/examples/spark/README.md b/terraform/examples/spark/README.md
deleted file mode 100644
index dca2a3992..000000000
--- a/terraform/examples/spark/README.md
+++ /dev/null
@@ -1,193 +0,0 @@
-# Nomad / Spark integration
-
-The Nomad ecosystem includes a fork of Apache Spark that natively supports using
-a Nomad cluster to run Spark applications. When running on Nomad, the Spark
-executors that run Spark tasks for your application, and optionally the
-application driver itself, run as Nomad tasks in a Nomad job. See the
-[usage guide](./RunningSparkOnNomad.pdf) for more details.
-
-Clusters provisioned with Nomad's Terraform templates are automatically
-configured to run the Spark integration. The sample job files found here are
-also provisioned onto every client and server.
-
-## Setup
-
-To give the Spark integration a test drive, provision a cluster and SSH to any
-one of the clients or servers (the public IPs are displayed when the Terraform
-provisioning process completes):
-
-```bash
-$ ssh -i /path/to/key ubuntu@PUBLIC_IP
-```
-
-The Spark history server and several of the sample Spark jobs below require
-HDFS. Using the included job file, deploy an HDFS cluster on Nomad:
-
-```bash
-$ cd $HOME/examples/spark
-$ nomad run hdfs.nomad
-$ nomad status hdfs
-```
-
-When the allocations are all in the `running` state (as shown by `nomad status
-hdfs`), query Consul to verify that the HDFS service has been registered:
-
-```bash
-$ dig hdfs.service.consul
-```
-
-Next, create directories and files in HDFS for use by the history server and the
-sample Spark jobs:
-
-```bash
-$ hdfs dfs -mkdir /foo
-$ hdfs dfs -put /var/log/apt/history.log /foo
-$ hdfs dfs -mkdir /spark-events
-$ hdfs dfs -ls /
-```
-
-Finally, deploy the Spark history server:
-
-```bash
-$ nomad run spark-history-server-hdfs.nomad
-```
-
-You can get the private IP for the history server with a Consul DNS lookup:
-
-```bash
-$ dig spark-history.service.consul
-```
-
-Cross-reference the private IP with the `terraform apply` output to get the
-corresponding public IP. You can access the history server at
-`http://PUBLIC_IP:18080`.
-
-## Sample Spark jobs
-
-The sample `spark-submit` commands listed below demonstrate several of the
-official Spark examples. Features like `spark-sql`, `spark-shell` and `pyspark`
-are included. The commands can be executed from any client or server.
-
-You can monitor the status of a Spark job in a second terminal session with:
-
-```bash
-$ nomad status
-$ nomad status JOB_ID
-$ nomad alloc-status DRIVER_ALLOC_ID
-$ nomad logs DRIVER_ALLOC_ID
-```
-
-To view the output of the job, run `nomad logs` for the driver's Allocation ID.
-
-### SparkPi (Java)
-
-```bash
-spark-submit \
-  --class org.apache.spark.examples.JavaSparkPi \
-  --master nomad \
-  --deploy-mode cluster \
-  --conf spark.executor.instances=4 \
-  --conf spark.nomad.cluster.monitorUntil=complete \
-  --conf spark.eventLog.enabled=true \
-  --conf spark.eventLog.dir=hdfs://hdfs.service.consul/spark-events \
-  --conf spark.nomad.sparkDistribution=https://nomad-spark.s3.amazonaws.com/spark-2.1.0-bin-nomad.tgz \
-  https://nomad-spark.s3.amazonaws.com/spark-examples_2.11-2.1.0-SNAPSHOT.jar 100
-```
-
-### Word count (Java)
-
-```bash
-spark-submit \
-  --class org.apache.spark.examples.JavaWordCount \
-  --master nomad \
-  --deploy-mode cluster \
-  --conf spark.executor.instances=4 \
-  --conf spark.nomad.cluster.monitorUntil=complete \
-  --conf spark.eventLog.enabled=true \
-  --conf spark.eventLog.dir=hdfs://hdfs.service.consul/spark-events \
-  --conf spark.nomad.sparkDistribution=https://nomad-spark.s3.amazonaws.com/spark-2.1.0-bin-nomad.tgz \
-  https://nomad-spark.s3.amazonaws.com/spark-examples_2.11-2.1.0-SNAPSHOT.jar \
-  hdfs://hdfs.service.consul/foo/history.log
-```
-
-### DFSReadWriteTest (Scala)
-
-```bash
-spark-submit \
-  --class org.apache.spark.examples.DFSReadWriteTest \
-  --master nomad \
-  --deploy-mode cluster \
-  --conf spark.executor.instances=4 \
-  --conf spark.nomad.cluster.monitorUntil=complete \
-  --conf spark.eventLog.enabled=true \
-  --conf spark.eventLog.dir=hdfs://hdfs.service.consul/spark-events \
-  --conf spark.nomad.sparkDistribution=https://nomad-spark.s3.amazonaws.com/spark-2.1.0-bin-nomad.tgz \
-  https://nomad-spark.s3.amazonaws.com/spark-examples_2.11-2.1.0-SNAPSHOT.jar \
-  /etc/sudoers hdfs://hdfs.service.consul/foo
-```
-
-### spark-shell
-
-Start the shell:
-
-```bash
-spark-shell \
-  --master nomad \
-  --conf spark.executor.instances=4 \
-  --conf spark.nomad.sparkDistribution=https://nomad-spark.s3.amazonaws.com/spark-2.1.0-bin-nomad.tgz
-```
-
-Run a few commands:
-
-```bash
-$ spark.version
-
-$ val data = 1 to 10000
-$ val distData = sc.parallelize(data)
-$ distData.filter(_ < 10).collect()
-```
-
-### sql-shell
-
-Start the shell:
-
-```bash
-spark-sql \
-  --master nomad \
-  --conf spark.executor.instances=4 \
-  --conf spark.nomad.sparkDistribution=https://nomad-spark.s3.amazonaws.com/spark-2.1.0-bin-nomad.tgz jars/spark-sql_2.11-2.1.0-SNAPSHOT.jar
-```
-
-Run a few commands:
-
-```bash
-$ CREATE TEMPORARY VIEW usersTable
-USING org.apache.spark.sql.parquet
-OPTIONS (
-  path "/usr/local/bin/spark/examples/src/main/resources/users.parquet"
-);
-
-$ SELECT * FROM usersTable;
-```
-
-### pyspark
-
-Start the shell:
-
-```bash
-pyspark \
-  --master nomad \
-  --conf spark.executor.instances=4 \
-  --conf spark.nomad.sparkDistribution=https://nomad-spark.s3.amazonaws.com/spark-2.1.0-bin-nomad.tgz
-```
-
-Run a few commands:
-
-```bash
-$ df = spark.read.json("/usr/local/bin/spark/examples/src/main/resources/people.json")
-$ df.show()
-$ df.printSchema()
-$ df.createOrReplaceTempView("people")
-$ sqlDF = spark.sql("SELECT * FROM people")
-$ sqlDF.show()
-```
diff --git a/terraform/examples/spark/RunningSparkOnNomad.pdf b/terraform/examples/spark/RunningSparkOnNomad.pdf
deleted file mode 100644
index f8faad517..000000000
--- a/terraform/examples/spark/RunningSparkOnNomad.pdf
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:40cbfa2171b1e36ced02e191913536475da97e1ecfc0aee5d7847545bc33a7c3
-size 239696
diff --git a/terraform/examples/spark/docker/hdfs/Dockerfile b/terraform/examples/spark/docker/hdfs/Dockerfile
deleted file mode 100644
index 1a6a7cc35..000000000
--- a/terraform/examples/spark/docker/hdfs/Dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM openjdk:7
-
-ENV HADOOP_VERSION 2.7.3
-
-RUN wget -O - http://apache.mirror.iphh.net/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz | tar xz -C /usr/local/
-ENV HADOOP_PREFIX /usr/local/hadoop-$HADOOP_VERSION
-ENV PATH $PATH:$HADOOP_PREFIX/bin
-
-COPY core-site.xml $HADOOP_PREFIX/etc/hadoop/
diff --git a/terraform/examples/spark/docker/hdfs/core-site.xml b/terraform/examples/spark/docker/hdfs/core-site.xml
deleted file mode 100644
index 75360fcc2..000000000
--- a/terraform/examples/spark/docker/hdfs/core-site.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
-<configuration>
-  <property>
-    <name>fs.defaultFS</name>
-    <value>hdfs://hdfs.service.consul/</value>
-  </property>
-</configuration>
diff --git a/terraform/examples/spark/docker/spark/Dockerfile b/terraform/examples/spark/docker/spark/Dockerfile
deleted file mode 100644
index 00e9d19a7..000000000
--- a/terraform/examples/spark/docker/spark/Dockerfile
+++ /dev/null
@@ -1,7 +0,0 @@
-FROM openjdk:7-jre
-
-RUN curl https://spark-nomad.s3.amazonaws.com/spark-2.1.1-bin-nomad.tgz | tar -xzC /tmp
-RUN mv /tmp/spark* /opt/spark
-
-ENV SPARK_HOME /opt/spark
-ENV PATH $PATH:$SPARK_HOME/bin
diff --git a/terraform/examples/spark/hdfs.nomad b/terraform/examples/spark/hdfs.nomad
deleted file mode 100644
index 35ddfb4e7..000000000
--- a/terraform/examples/spark/hdfs.nomad
+++ /dev/null
@@ -1,91 +0,0 @@
-job "hdfs" {
-
-  datacenters = ["dc1"]
-
-  group "NameNode" {
-
-    constraint {
-      operator = "distinct_hosts"
-      value    = "true"
-    }
-
-    task "NameNode" {
-
-      driver = "docker"
-
-      config {
-        image        = "rcgenova/hadoop-2.7.3"
-        command      = "bash"
-        args         = ["-c", "hdfs namenode -format && exec hdfs namenode -D fs.defaultFS=hdfs://${NOMAD_ADDR_ipc}/ -D dfs.permissions.enabled=false"]
-        network_mode = "host"
-        port_map {
-          ipc = 8020
-          ui  = 50070
-        }
-      }
-
-      resources {
-        memory = 500
-        network {
-          port "ipc" {
-            static = "8020"
-          }
-          port "ui" {
-            static = "50070"
-          }
-        }
-      }
-
-      service {
-        name = "hdfs"
-        port = "ipc"
-      }
-    }
-  }
-
-  group "DataNode" {
-
-    count = 3
-
-    constraint {
-      operator = "distinct_hosts"
-      value    = "true"
-    }
-
-    task "DataNode" {
-
-      driver = "docker"
-
-      config {
-        network_mode = "host"
-        image        = "rcgenova/hadoop-2.7.3"
-        args         = ["hdfs", "datanode"
-          , "-D", "fs.defaultFS=hdfs://hdfs.service.consul/"
-          , "-D", "dfs.permissions.enabled=false"
-        ]
-        port_map {
-          data = 50010
-          ipc  = 50020
-          ui   = 50075
-        }
-      }
-
-      resources {
-        memory = 500
-        network {
-          port "data" {
-            static = "50010"
-          }
-          port "ipc" {
-            static = "50020"
-          }
-          port "ui" {
-            static = "50075"
-          }
-        }
-      }
-
-    }
-  }
-
-}
diff --git a/terraform/examples/spark/spark-history-server-hdfs.nomad b/terraform/examples/spark/spark-history-server-hdfs.nomad
deleted file mode 100644
index e7293a549..000000000
--- a/terraform/examples/spark/spark-history-server-hdfs.nomad
+++ /dev/null
@@ -1,45 +0,0 @@
-job "spark-history-server" {
-  datacenters = ["dc1"]
-  type        = "service"
-
-  group "server" {
-    count = 1
-
-    task "history-server" {
-      driver = "docker"
-
-      config {
-        image   = "barnardb/spark"
-        command = "/spark/spark-2.1.0-bin-nomad/bin/spark-class"
-        args    = ["org.apache.spark.deploy.history.HistoryServer"]
-        port_map {
-          ui = 18080
-        }
-        network_mode = "host"
-      }
-
-      env {
-        "SPARK_HISTORY_OPTS" = "-Dspark.history.fs.logDirectory=hdfs://hdfs.service.consul/spark-events/"
-        "SPARK_PUBLIC_DNS"   = "spark-history.service.consul"
-      }
-
-      resources {
-        cpu    = 500
-        memory = 500
-        network {
-          mbits = 250
-          port "ui" {
-            static = 18080
-          }
-        }
-      }
-
-      service {
-        name = "spark-history"
-        tags = ["spark", "ui"]
-        port = "ui"
-      }
-    }
-
-  }
-}