Puppet Class: dumps::web::cleanups::xmldumps

Defined in:
modules/dumps/manifests/web/cleanups/xmldumps.pp

Overview

Parameters:

  • xmldumpsdir (Any) (defaults to: undef)
  • dumpstempdir (Any) (defaults to: undef)
  • user (Any) (defaults to: undef)
  • isreplica (Any) (defaults to: undef)

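All four parameters default to undef and are expected to be supplied by the caller. A minimal declaration might look like the following sketch; the paths and user shown are illustrative placeholders, not the production values:

    class { 'dumps::web::cleanups::xmldumps':
        xmldumpsdir  => '/data/xmldatadumps/public',  # hypothetical dumps tree
        dumpstempdir => '/data/xmldatadumps/temp',    # hypothetical temp dir (used on generator hosts)
        user         => 'dumpsgen',                   # hypothetical cron user
        isreplica    => true,                         # replica (web) host rather than generator NFS host
    }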

# File 'modules/dumps/manifests/web/cleanups/xmldumps.pp', line 1

class dumps::web::cleanups::xmldumps(
    $xmldumpsdir = undef,
    $dumpstempdir = undef,
    $user = undef,
    $isreplica = undef,
) {
    $wikilist_dir = '/etc/dumps/dblists'
    file { $wikilist_dir:
        ensure => 'directory',
        path   => $wikilist_dir,
        mode   => '0755',
        owner  => 'root',
        group  => 'root',
    }

    # these lists are used only to decide how many dumps of
    # each type of wiki we keep.
    $bigwikis = ['dewiki', 'eswiki', 'frwiki', 'itwiki', 'jawiki',
                  'metawiki', 'nlwiki', 'plwiki', 'ptwiki', 'ruwiki',
                  'commonswiki', 'svwiki', 'zhwiki', 'kowiki']
    $bigwikis_dblist = join($bigwikis, "\n")

    $hugewikis = ['enwiki', 'wikidatawiki']
    $hugewikis_dblist = join($hugewikis, "\n")

    file { "${wikilist_dir}/hugewikis.dblist":
        ensure  => 'present',
        path    => "${wikilist_dir}/hugewikis.dblist",
        mode    => '0644',
        owner   => 'root',
        group   => 'root',
        content => "${hugewikis_dblist}\n",
    }

    file { "${wikilist_dir}/bigwikis.dblist":
        ensure  => 'present',
        path    => "${wikilist_dir}/bigwikis.dblist",
        mode    => '0644',
        owner   => 'root',
        group   => 'root',
        content => "${bigwikis_dblist}\n",
    }

    # how many dumps we keep of each type. in practice we keep one
    # less, so that when a new dump run starts and partial dumps are
    # copied over to the web server, space is available for that new
    # run BEFORE it is copied.

    # on generator nfs hosts we must keep a minimum of 3 so that at any time
    # we have at least one old full dump around, with all revision content
    # which can be stolen from for the next dump run.  This is due to
    # the way we run dumps: one full run, then one run without full
    # revision content, etc.
    # we also need to keep partials of 2 dumps, for prefetch purposes,
    # just in case there's an issue with the last full run.
    $keep_generator = ['hugewikis.dblist:3:2', 'bigwikis.dblist:3:2', 'default:3:2']
    $keep_replicas = ['hugewikis.dblist:7', 'bigwikis.dblist:8', 'default:10']

    if ($isreplica == true) {
        $content = join($keep_replicas, "\n")
    } else {
        $content = join($keep_generator, "\n")
    }

    file { '/etc/dumps/xml_keeps.conf':
        ensure  => 'present',
        path    => '/etc/dumps/xml_keeps.conf',
        mode    => '0644',
        owner   => 'root',
        group   => 'root',
        content => "${content}\n",
    }

    # set up the file containing expressions to match dump output
    # files we need to keep around, for those dumps we don't remove
    # completely, on the dumps generator nfs hosts.
    if ($isreplica == false) {
        $patternslist = ['.*-pages-articles[0-9]*.xml.*(bz2|7z)',
                        '.*-pages-meta-current[0-9]*.xml.*(bz2|7z)',
                        '.*-pages-meta-history[0-9]*.xml.*(bz2|7z)',
                        '.*-flowhistory.xml.gz',
                        '.*dumpruninfo.txt']
        $patterns = join($patternslist, "\n")
        $patternsfile = '/etc/dumps/xml_keep_patterns.conf'
        file { $patternsfile:
            ensure  => 'present',
            path    => $patternsfile,
            mode    => '0644',
            owner   => 'root',
            group   => 'root',
            content => "${patterns}\n",
        }
    }

    file { '/usr/local/bin/cleanup_old_xmldumps.py':
        ensure => 'present',
        path   => '/usr/local/bin/cleanup_old_xmldumps.py',
        mode   => '0644',
        owner  => 'root',
        group  => 'root',
        source => 'puppet:///modules/dumps/web/cleanups/cleanup_old_xmldumps.py',
    }

    $xmlclean = '/usr/bin/python3 /usr/local/bin/cleanup_old_xmldumps.py'
    $args = "-d ${xmldumpsdir} -w ${wikilist_dir} -k /etc/dumps/xml_keeps.conf"

    if ($isreplica == false) {
        # the temp dir only exists on the generating hosts (nfs servers),
        # so only clean up temp files there
        $tempclean = "/usr/bin/find ${dumpstempdir} -type f -mtime +20 -exec rm {} \\;"
        # patternsfile has patterns that match dump output files we want to keep,
        # for dump runs we don't want to remove completely, on the dumps generator nfs hosts
        $cron_commands = "${xmlclean} ${args} -p ${patternsfile} ; ${tempclean}"
    } else {
        $cron_commands = "${xmlclean} ${args}"
    }
    cron { 'cleanup_xmldumps':
        ensure      => 'present',
        environment => 'MAILTO=ops-dumps@wikimedia.org',
        command     => $cron_commands,
        user        => $user,
        minute      => '25',
        hour        => '1',
        require     => File['/usr/local/bin/cleanup_old_xmldumps.py'],
    }
}
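
For reference, on a replica host the keep list above renders /etc/dumps/xml_keeps.conf with one entry per line, each entry being a dblist name (or "default") and the number of dump runs to retain; on generator NFS hosts each entry carries a second number which, per the comments in the class, is how many runs keep their partial files around for prefetch:

    hugewikis.dblist:7
    bigwikis.dblist:8
    default:10

The nightly cron job (01:25) then runs the cleanup script against that configuration. With an illustrative xmldumpsdir of /data/xmldatadumps/public, the replica command line would be:

    /usr/bin/python3 /usr/local/bin/cleanup_old_xmldumps.py \
        -d /data/xmldatadumps/public -w /etc/dumps/dblists -k /etc/dumps/xml_keeps.conf

On generator hosts the same command additionally passes -p /etc/dumps/xml_keep_patterns.conf and is followed by the find invocation that removes files older than 20 days from dumpstempdir.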