Puppet Class: cdh::spark

Defined in:
modules/cdh/manifests/spark.pp

Overview

Class cdh::spark

Installs Spark configured to run in YARN mode. Include this class on your client nodes; it does not need to be installed on every worker node.
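
A minimal usage sketch for the default YARN mode on a client node (the node name is hypothetical, and cdh::hadoop is assumed to get its cluster settings from Hiera; this class requires cdh::hadoop to be declared):

node 'spark-client.example.net' {
    # Hadoop client configuration must be present; this class declares
    # Class['cdh::hadoop'] -> Class['cdh::spark'].
    include cdh::hadoop
    # Default parameters configure Spark for YARN with Dynamic
    # Resource Allocation enabled.
    include cdh::spark
}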

Parameters

$master_host - If set, Spark will be configured to work in standalone mode rather than in YARN (a standalone example follows the parameter list below). Include cdh::spark::master on this host and cdh::spark::worker on all standalone Spark Worker nodes. Default: undef

$worker_cores - Number of cores to allocate per Spark worker. Only used in standalone mode. Default: undef ($::processorcount)

$worker_memory - Total amount of memory workers are allowed to use on a node. Only used in standalone mode. Default: undef ($::memorysize_mb - 1024)

$worker_instances - Number of worker instances to run on a node. Note that $worker_cores applies to each worker, so if you increase this, reduce $worker_cores accordingly. Only used in standalone mode. Default: undef (1)

$daemon_memory - Memory to allocate to the Spark master and worker daemons themselves. Only used in standalone mode. Default: undef (512m)

$dynamic_allocation_enabled - If true, Spark will be configured to use Dynamic Resource Allocation. Only available in YARN mode. Default: true

$dynamic_allocation_executor_idle_timeout - Value for the corresponding Spark Dynamic Resource Allocation executor idle timeout setting. Only available in YARN mode. Default: '60s'

$dynamic_allocation_cached_executor_idle_timeout - Value for the corresponding Spark Dynamic Resource Allocation cached executor idle timeout setting. Only available in YARN mode. Default: '3600s'

$use_kerberos - Use Kerberos authentication when creating HDFS directories. Default: false

Parameters:

  • master_host (Any) (defaults to: undef)
  • worker_cores (Any) (defaults to: undef)
  • worker_memory (Any) (defaults to: undef)
  • worker_instances (Any) (defaults to: undef)
  • daemon_memory (Any) (defaults to: undef)
  • dynamic_allocation_enabled (Any) (defaults to: true)
  • dynamic_allocation_executor_idle_timeout (Any) (defaults to: '60s')
  • dynamic_allocation_cached_executor_idle_timeout (Any) (defaults to: '3600s')
  • use_kerberos (Any) (defaults to: false)
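
For a standalone (non-YARN) cluster, the sketch below assumes hypothetical node names and illustrative resource values; note that dynamic_allocation_enabled must be set to false, since the class fails when standalone mode and Dynamic Resource Allocation are both enabled:

# On every node in the standalone cluster (master, workers, and clients).
# Resource values are illustrative only.
class { 'cdh::spark':
    master_host                => 'spark-master.example.net',
    worker_cores               => 4,
    worker_memory              => 8192,
    dynamic_allocation_enabled => false,
}

# Additionally, on the master host only:
include cdh::spark::master

# And on each standalone worker node:
include cdh::spark::worker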


# File 'modules/cdh/manifests/spark.pp', line 37

class cdh::spark(
    $master_host                                     = undef,
    $worker_cores                                    = undef,
    $worker_memory                                   = undef,
    $worker_instances                                = undef,
    $daemon_memory                                   = undef,
    $dynamic_allocation_enabled                      = true,
    $dynamic_allocation_executor_idle_timeout        = '60s',
    $dynamic_allocation_cached_executor_idle_timeout = '3600s',
    $use_kerberos                                    = false,
)
{
    # Spark requires Hadoop configs installed.
    Class['cdh::hadoop'] -> Class['cdh::spark']

    # If $master_host was set,
    # then we will be configuring a standalone spark cluster.
    $standalone_enabled = $master_host ? {
        undef   => false,
        default => true,
    }

    if $standalone_enabled and $dynamic_allocation_enabled {
        fail('Spark Dynamic Resource Allocation is only available in YARN mode.')
    }

    package { ['spark-core', 'spark-python']:
        ensure => 'installed',
    }

    $config_directory = "/etc/spark/conf.${cdh::hadoop::cluster_name}"
    # Create the $cluster_name based $config_directory.
    file { $config_directory:
        ensure  => 'directory',
        require => Package['spark-core'],
    }
    cdh::alternative { 'spark-conf':
        link => '/etc/spark/conf',
        path => $config_directory,
    }

    # Only need to ensure these directories once.
    # TODO: In default YARN mode, how to make sure we only check these directories from one puppet host?
    if !$standalone_enabled or $master_host == $::fqdn {
        # sudo -u hdfs hdfs dfs -mkdir /user/spark
        # sudo -u hdfs hdfs dfs -chmod 0755 /user/spark
        # sudo -u hdfs hdfs dfs -chown spark:spark /user/spark
        cdh::hadoop::directory { '/user/spark':
            owner        => 'spark',
            group        => 'spark',
            mode         => '0755',
            use_kerberos => $use_kerberos,
            require      => Package['spark-core'],
        }

        cdh::hadoop::directory { '/user/spark/share':
            owner        => 'spark',
            group        => 'spark',
            mode         => '0755',
            use_kerberos => $use_kerberos,
            require      => Cdh::Hadoop::Directory['/user/spark'],
        }
        cdh::hadoop::directory { '/user/spark/share/lib':
            owner        => 'spark',
            group        => 'spark',
            mode         => '0755',
            use_kerberos => $use_kerberos,
            require      => Cdh::Hadoop::Directory['/user/spark/share'],
        }

        cdh::hadoop::directory { '/user/spark/applicationHistory':
            owner        => 'spark',
            group        => 'spark',
            mode         => '1777',
            use_kerberos => $use_kerberos,
            require      => Cdh::Hadoop::Directory['/user/spark'],
        }
    }

    $namenode_address = $::cdh::hadoop::ha_enabled ? {
        true    => $cdh::hadoop::nameservice_id,
        default => $cdh::hadoop::primary_namenode_host,
    }

    if !$standalone_enabled {
        # Put the Spark assembly jar into HDFS so that it
        # doesn't have to be uploaded for each Spark job submission.

        $spark_jar_hdfs_path = "hdfs://${namenode_address}/user/spark/share/lib/spark-assembly.jar"
        kerberos::exec { 'spark_assembly_jar_install':
            command      => "/usr/bin/hdfs dfs -put -f /usr/lib/spark/lib/spark-assembly.jar ${spark_jar_hdfs_path}",
            unless       => '/usr/bin/hdfs dfs -ls /user/spark/share/lib/spark-assembly.jar | grep -q /user/spark/share/lib/spark-assembly.jar',
            user         => 'spark',
            require      => Cdh::Hadoop::Directory['/user/spark/share/lib'],
            before       => [
                File["${config_directory}/spark-env.sh"],
                File["${config_directory}/spark-defaults.conf"]
            ],
            timeout      => 60,
            use_kerberos => $use_kerberos,
        }
    }

    file { "${config_directory}/spark-env.sh":
        content => template('cdh/spark/spark-env.sh.erb'),
    }

    file { "${config_directory}/spark-defaults.conf":
        content => template('cdh/spark/spark-defaults.conf.erb'),
    }

    file { "${config_directory}/log4j.properties":
        source => 'puppet:///modules/cdh/spark/log4j.properties',
    }

    $hive_site_symlink_ensure = defined(Class['cdh::hive']) ? {
        true    => 'link',
        default => 'absent'
    }

    file { "${config_directory}/hive-site.xml":
        ensure => $hive_site_symlink_ensure,
        target => "${::cdh::hive::config_directory}/hive-site.xml",
    }
}
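
Note that the hive-site.xml symlink at the end of the class is only created when Class['cdh::hive'] is present in the catalog; since defined() is parse-order dependent in Puppet, cdh::hive should be declared before cdh::spark. A hypothetical client-node sketch:

node 'analytics-client.example.net' {
    include cdh::hadoop   # cluster settings assumed to come from Hiera
    include cdh::hive     # declared first so defined(Class['cdh::hive']) evaluates true
    include cdh::spark    # symlinks hive-site.xml into the Spark config directory
}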