docker-compose.yml
services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    container_name: namenode
    restart: always
    ports:
      - 9870:9870
      - 9000:9000
    environment:
      - CLUSTER_NAME=hadoop_cluster
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
    volumes:
      - hadoop_namenode:/hadoop/dfs/name
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9870"]
      interval: 30s
      timeout: 10s
      retries: 3

  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode
    restart: always
    ports:
      - 9864:9864
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
    volumes:
      - hadoop_datanode:/hadoop/dfs/data
    depends_on:
      - namenode

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
    container_name: resourcemanager
    restart: always
    ports:
      - 8088:8088
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
      - YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
    depends_on:
      - namenode

  nodemanager:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
    container_name: nodemanager
    restart: always
    ports:
      - 8042:8042
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
      - YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
      - YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle
    depends_on:
      - resourcemanager

  spark-master:
    image: bitnami/spark:3.5
    container_name: spark-master
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - "8080:8080"
      - "7077:7077"
    volumes:
      - spark_data:/bitnami/spark

  spark-worker:
    image: bitnami/spark:3.5
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    # ports:
    #   - "8081-8084:8081"
    volumes:
      - spark_data:/bitnami/spark
    depends_on:
      - spark-master
    deploy:
      replicas: 2

  jupyter:
    image: jupyter/pyspark-notebook:latest
    container_name: jupyter
    ports:
      - "8888:8888"
    environment:
      - JUPYTER_ENABLE_LAB=yes
      - SPARK_MASTER=spark://spark-master:7077
    volumes:
      - jupyter_data:/home/jovyan/work
      - ./notebooks:/home/jovyan/work/notebooks
    depends_on:
      - spark-master

volumes:
  hadoop_namenode:
  hadoop_datanode:
  spark_data:
  jupyter_data:
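Not part of the original setup, just a quick way to confirm the stack came up: a minimal Python sketch, run from the host, that probes each published web UI, assuming the stack was started with docker compose up -d and the port mappings above.

# Smoke test of the published web UIs (illustrative, assumes the default port mappings)
from urllib.request import urlopen

uis = {
    "namenode": "http://localhost:9870",
    "datanode": "http://localhost:9864",
    "resourcemanager": "http://localhost:8088",
    "nodemanager": "http://localhost:8042",
    "spark-master": "http://localhost:8080",
    "jupyter": "http://localhost:8888",
}
for name, url in uis.items():
    try:
        print(name, url, "HTTP", urlopen(url, timeout=5).status)
    except Exception as exc:  # connection refused, timeout, etc.
        print(name, url, "FAILED:", exc)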
Writing a Spark DataFrame to HDFS and reading it back works fine from spark-shell.
First of all, I create a test directory in HDFS and give it 777 permissions.
hdfs commands
hdfs dfs -mkdir -p /test/employees
hdfs dfs -chmod -R 777 /test
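As a sanity check (my addition, not one of the original steps), the directory and its permissions can also be inspected through the namenode's WebHDFS REST API, assuming WebHDFS is enabled (the Hadoop default) and port 9870 is published as in the compose file.

# Illustrative check of /test/employees via WebHDFS
import json
from urllib.request import urlopen

url = "http://localhost:9870/webhdfs/v1/test/employees?op=GETFILESTATUS"
info = json.load(urlopen(url, timeout=5))["FileStatus"]
print(info["type"], info["permission"], info["owner"])  # e.g. DIRECTORY 777 root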
spark-shell
// First, create a dummy dataframe
val data = Seq(
(1, "John", 30, "New York"),
(2, "Alice", 25, "San Francisco"),
(3, "Bob", 35, "Chicago"),
(4, "Carol", 28, "Boston"),
(5, "David", 40, "Seattle")
)
// Define the schema
val columns = Seq("id", "name", "age", "city")
// Create the DataFrame
val df = spark.createDataFrame(data).toDF(columns: _*)
// Show the DataFrame
df.show()
// Write to HDFS
df.write
.mode("overwrite")
.parquet("hdfs://namenode:9000/test/employees")
// Read back from HDFS
val dfRead = spark.read
.parquet("hdfs://namenode:9000/test/employees")
// Show the read DataFrame
println("\nData read back from HDFS:")
dfRead.show()
// Perform some basic analysis
println("\nBasic statistics:")
dfRead.describe("age").show()
println("\nCount by city:")
dfRead.groupBy("city").count().show()
But when I try to do the same thing in Jupyter, it cannot create a Spark DataFrame, even though the Spark session connects successfully.
Install the pyspark package
pip install pyspark
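One assumption worth verifying (my note, not something the original steps check): the pyspark version installed in the notebook should line up with the bitnami/spark:3.5 images used by the cluster, since a driver/cluster version mismatch can cause failures even when the session itself connects.

# Quick version check in the notebook (illustrative)
import pyspark
print(pyspark.__version__)  # expected to report a 3.5.x release to match the cluster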
jupyter notebook
from pyspark.sql import SparkSession
# Create Spark session with proper configuration
spark = SparkSession.builder \
.appName("JupyterTest") \
.master("spark://spark-master:7077") \
.config("spark.driver.host", "jupyter") \
.config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
.getOrCreate()
# Create test DataFrame
data = [("John", 30), ("Alice", 25), ("Bob", 35)]
df = spark.createDataFrame(data, ["name", "age"])
# Show DataFrame
print("Original DataFrame:")
df.show()
# Write to HDFS
df.write.mode("overwrite").parquet("hdfs://namenode:9000/test/people")
# Read back from HDFS
df_read = spark.read.parquet("hdfs://namenode:9000/test/people")
print("\nData read from HDFS:")
df_read.show()
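For completeness, here is a small sanity check that could be run in the same notebook (a sketch, assuming the session from the cell above was created): it prints the master the driver actually attached to, the effective fs.defaultFS, and runs a trivial job so the executors must reach back to the driver.

# Sanity check of the session created above (illustrative)
print(spark.version)
print(spark.sparkContext.master)  # expected: spark://spark-master:7077
print(spark.sparkContext.getConf().get("spark.hadoop.fs.defaultFS"))
print(spark.sparkContext.parallelize(range(10)).count())  # hangs/fails if executors cannot reach the driver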
Is something missing from my Spark session config?