1   
 2   
 3   
 4   
 5   
 6   
 7   
 8   
 9   
10   
11   
12   
13   
14   
15   
16   
17   
18  """ 
19  PySpark is the Python API for Spark. 
20   
21  Public classes: 
22   
23    - L{SparkContext<pyspark.context.SparkContext>} 
24        Main entry point for Spark functionality. 
25    - L{RDD<pyspark.rdd.RDD>} 
26        A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. 
27    - L{Broadcast<pyspark.broadcast.Broadcast>} 
28        A broadcast variable that gets reused across tasks. 
29    - L{Accumulator<pyspark.accumulators.Accumulator>} 
30        An "add-only" shared variable that tasks can only add values to. 
31    - L{SparkConf<pyspark.conf.SparkConf>} 
32        For configuring Spark. 
33    - L{SparkFiles<pyspark.files.SparkFiles>} 
34        Access files shipped with jobs. 
35    - L{StorageLevel<pyspark.storagelevel.StorageLevel>} 
36        Finer-grained cache persistence levels. 
37  """ 
38   
39   
40   
41  import sys 
42  import os 
# Make the bundled Py4J client (used to talk to the JVM gateway) importable.
# Build the path from separate components instead of a hard-coded "python/lib/..."
# fragment so os.path.join stays portable, and skip the insert if the entry is
# already present to avoid stacking duplicates on re-execution.
# NOTE: raises KeyError (intentionally loud) when SPARK_HOME is not set.
_py4j_path = os.path.join(
    os.environ["SPARK_HOME"], "python", "lib", "py4j-0.8.1-src.zip"
)
if _py4j_path not in sys.path:
    sys.path.insert(0, _py4j_path)
44   
45   
46  from pyspark.conf import SparkConf 
47  from pyspark.context import SparkContext 
48  from pyspark.rdd import RDD 
49  from pyspark.files import SparkFiles 
50  from pyspark.storagelevel import StorageLevel 
51   
52   
53  __all__ = ["SparkConf", "SparkContext", "RDD", "SparkFiles", "StorageLevel"] 
54