Example of processing and analyzing big data using Spark in Python
from pyspark.sql import SparkSession
# Create a SparkSession object
spark = SparkSession.builder.appName("BigData").getOrCreate()
# Load the data from a CSV file; inferSchema=True asks Spark to detect
# column types so the numeric filter and sum below work on typed columns
data = spark.read.csv("data.csv", header=True, inferSchema=True)
# Select the relevant columns
columns = ["column1", "column2", "column3"]
data = data.select(columns)
# Filter the data based on a condition
data = data.filter(data.column1 > 0)
# Group the data by a column and aggregate the values
data = data.groupBy("column2").agg({"column3": "sum"})
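# Note: the dictionary form above names the result column "sum(column3)".
# An equivalent alternative using pyspark.sql.functions gives the result
# an explicit alias (total_column3 is an illustrative name):
#   from pyspark.sql import functions as F
#   data = data.groupBy("column2").agg(F.sum("column3").alias("total_column3"))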
# Write the processed data out; note that Spark creates "output.csv" as a
# directory of part files, not a single CSV file
data.write.csv("output.csv", header=True)
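A minimal follow-up sketch, assuming the aggregated data DataFrame from above: when a single CSV file is preferred over a directory of part files, the result can be collapsed to one partition before writing. The output_single path and overwrite mode are illustrative choices, and coalescing is only advisable when the result is small enough to fit in one partition.
# Preview a few rows in the driver before writing
data.show(5)
# Collapse to one partition so the output directory holds a single part file
# (output_single is an illustrative path; overwrite replaces any prior run)
data.coalesce(1).write.mode("overwrite").csv("output_single", header=True)
# Release the session's resources once processing is finished
spark.stop()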