Puppet Class: dumps::web::cleanups::xmldumps

Defined in:
modules/dumps/manifests/web/cleanups/xmldumps.pp

Overview

This class manages cleanup of old XML dump output on the dumps hosts. It writes
the hugewikis/bigwikis dblists and a keep-count configuration under /etc/dumps,
installs the cleanup_old_xmldumps.py script, and sets up a daily systemd timer
job that prunes old dump runs. On the generator NFS hosts it additionally writes
a pattern file of output files to retain and schedules cleanup of stale files in
the dumps temp directory.

Parameters:

  • xmldumpsdir (Any) (defaults to: undef): root directory of the XML dump output to be cleaned up
  • dumpstempdir (Any) (defaults to: undef): temporary dumps directory, cleaned up only on the generator NFS hosts
  • user (Any) (defaults to: undef): user the cleanup systemd timer jobs run as
  • isreplica (Any) (defaults to: undef): true on replica hosts, false on the dumps generator NFS hosts; selects the keep counts and the extra temp/pattern cleanup
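
A minimal declaration sketch; the paths and user shown here are hypothetical
placeholders, not values defined by this module:

    class { 'dumps::web::cleanups::xmldumps':
        xmldumpsdir  => '/data/xmldatadumps/public',  # hypothetical path
        dumpstempdir => '/data/xmldatadumps/temp',    # hypothetical path
        user         => 'dumpsgen',                   # hypothetical user
        isreplica    => false,                        # a generator NFS host
    }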


# File 'modules/dumps/manifests/web/cleanups/xmldumps.pp', line 1

class dumps::web::cleanups::xmldumps(
    $xmldumpsdir = undef,
    $dumpstempdir = undef,
    $user = undef,
    $isreplica = undef,
) {
    $wikilist_dir = '/etc/dumps/dblists'
    file { $wikilist_dir:
        ensure => 'directory',
        path   => $wikilist_dir,
        mode   => '0755',
        owner  => 'root',
        group  => 'root',
    }

    # these lists are used only to decide how many dumps of
    # each type of wiki we keep.
    $bigwikis = ['dewiki', 'eswiki', 'frwiki', 'itwiki', 'jawiki',
                  'metawiki', 'nlwiki', 'plwiki', 'ptwiki', 'ruwiki',
                  'commonswiki', 'svwiki', 'zhwiki', 'kowiki']
    $bigwikis_dblist = join($bigwikis, "\n")

    $hugewikis = ['enwiki', 'wikidatawiki']
    $hugewikis_dblist = join($hugewikis, "\n")

    file { "${wikilist_dir}/hugewikis.dblist":
        ensure  => 'present',
        path    => "${wikilist_dir}/hugewikis.dblist",
        mode    => '0644',
        owner   => 'root',
        group   => 'root',
        content => "${hugewikis_dblist}\n",
    }

    file { "${wikilist_dir}/bigwikis.dblist":
        ensure  => 'present',
        path    => "${wikilist_dir}/bigwikis.dblist",
        mode    => '0644',
        owner   => 'root',
        group   => 'root',
        content => "${bigwikis_dblist}\n",
    }

    # how many dumps we keep of each type. in practice we keep one
    # less, so that when a new dump run starts and partial dumps are
    # copied over to the web server, space is available for that new
    # run BEFORE it is copied.

    # on generator nfs hosts we must keep a minimum of 3 so that at any time
    # we have at least one old full dump around, with all revision content
    # which can be stolen from for the next dump run.  This is due to
    # the way we run dumps: one full run, then one run without full
    # revision content, etc.
    # we would like to keep partials of 2 dumps, for prefetch purposes,
    # just in case there's an issue with the last full run. This is most important
    # for the small wikis, for which we don't generate page dumps in small pieces
    # and so broken prefetch files mean pulling all historical revision content
    # directly from the database.

    $keep_generator = ['hugewikis.dblist:3:1', 'bigwikis.dblist:3:1', 'default:3:1']
    $keep_replicas = ['hugewikis.dblist:7', 'bigwikis.dblist:8', 'default:10']

    if ($isreplica == true) {
        $content = join($keep_replicas, "\n")
    } else {
        $content = join($keep_generator, "\n")
    }

    file { '/etc/dumps/xml_keeps.conf':
        ensure  => 'present',
        path    => '/etc/dumps/xml_keeps.conf',
        mode    => '0644',
        owner   => 'root',
        group   => 'root',
        content => "${content}\n",
    }

    # set up the file containing expressions to match dump output
    # files we need to keep around, for those dumps we don't remove
    # completely, on the dumps generator nfs hosts.
    if ($isreplica == false) {
        $patternslist = ['.*-pages-articles[0-9]*.xml.*(bz2|7z)',
                        '.*-pages-meta-current[0-9]*.xml.*(bz2|7z)',
                        '.*-pages-meta-history[0-9]*.xml.*(bz2|7z)',
                        '.*-flowhistory.xml.gz',
                        '.*dumpruninfo.txt']
        $patterns = join($patternslist, "\n")
        $patternsfile = '/etc/dumps/xml_keep_patterns.conf'
        file { $patternsfile:
            ensure  => 'present',
            path    => $patternsfile,
            mode    => '0644',
            owner   => 'root',
            group   => 'root',
            content => "${patterns}\n",
        }
    }

    file { '/usr/local/bin/cleanup_old_xmldumps.py':
        ensure => 'present',
        path   => '/usr/local/bin/cleanup_old_xmldumps.py',
        mode   => '0644',
        owner  => 'root',
        group  => 'root',
        source => 'puppet:///modules/dumps/web/cleanups/cleanup_old_xmldumps.py',
    }

    $xmlclean = '/usr/bin/python3 /usr/local/bin/cleanup_old_xmldumps.py'
    $args = "-d ${xmldumpsdir} -w ${wikilist_dir} -k /etc/dumps/xml_keeps.conf"

    if ($isreplica == false) {
        # the temp dir only exists on the generating hosts (nfs servers),
        # so only clean up temp files there
        $tempclean = "/usr/bin/find ${dumpstempdir} -type f -mtime +20 -exec rm {} \\;"
        systemd::timer::job { 'cleanup_tmpdumps':
            ensure             => present,
            description        => 'Regular jobs to clean up tmp dumps',
            user               => $user,
            monitoring_enabled => false,
            send_mail          => true,
            environment        => {'MAILTO' => 'ops-dumps@wikimedia.org'},
            command            => $tempclean,
            interval           => {'start' => 'OnCalendar', 'interval' => '*-*-* 2:25:0'},
            require            => File['/usr/local/bin/cleanup_old_xmldumps.py'],
        }
        # patternsfile has patterns that match dump output files we want to keep,
        # for dump runs we don't want to remove completely, on the dumps generator nfs hosts
        $job_command = "${xmlclean} ${args} -p ${patternsfile}"
    } else {
        $job_command = "${xmlclean} ${args}"
    }
    systemd::timer::job { 'cleanup_xmldumps':
        ensure             => present,
        description        => 'Regular jobs to clean up xml dumps',
        user               => $user,
        monitoring_enabled => false,
        send_mail          => true,
        environment        => {'MAILTO' => 'ops-dumps@wikimedia.org'},
        command            => $job_command,
        interval           => {'start' => 'OnCalendar', 'interval' => '*-*-* 9:25:0'},
        require            => File['/usr/local/bin/cleanup_old_xmldumps.py'],
    }
}
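
For illustration, a sketch of what the manifest renders; the dumps directory
below is a hypothetical placeholder. On a replica host (isreplica true),
/etc/dumps/xml_keeps.conf would contain:

    hugewikis.dblist:7
    bigwikis.dblist:8
    default:10

On a generator NFS host (isreplica false), the keep_generator entries are
written instead, and the daily cleanup_xmldumps timer runs a command of the
form:

    /usr/bin/python3 /usr/local/bin/cleanup_old_xmldumps.py -d /data/xmldatadumps/public -w /etc/dumps/dblists -k /etc/dumps/xml_keeps.conf -p /etc/dumps/xml_keep_patterns.conf

where /data/xmldatadumps/public stands in for whatever xmldumpsdir is set to;
on replica hosts the same command runs without the -p option.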