Puppet Class: dumps::web::fetches::wikitech_dumps

Defined in:
modules/dumps/manifests/web/fetches/wikitech_dumps.pp

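Overview

Manages the "wikitech" directory under miscdatasetsdir and installs a daily root cron job (03:20) that mirrors url into that directory with wget, then deletes wget's index.html files and any file older than 90 days.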

Parameters:

  • url (Any) (defaults to: undef)
  • miscdatasetsdir (Any) (defaults to: undef)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'modules/dumps/manifests/web/fetches/wikitech_dumps.pp', line 1

# @summary Mirrors files published at a wikitech URL into the misc datasets tree.
#
# Manages the directory "${miscdatasetsdir}/wikitech" and installs a root cron
# job that, at 03:20 every day, mirrors $url into that directory with wget and
# then prunes wget's index.html files and any file older than 90 days.
#
# @param url
#   Base URL handed to `wget -m`. It is interpolated into the cron command
#   unquoted, so it must not contain whitespace or shell metacharacters.
#   NOTE(review): defaults to undef, which would yield a wget command with no
#   URL -- callers are presumably expected to always set it; confirm.
# @param miscdatasetsdir
#   Parent directory under which the "wikitech" directory is created. Also
#   interpolated into the cron command unquoted.
#   NOTE(review): defaults to undef, which would presumably make the target
#   "/wikitech" -- confirm that callers always set it.
class dumps::web::fetches::wikitech_dumps(
    $url            = undef,
    $miscdatasetsdir = undef,
) {

    # destination of the mirror; also the root of both cleanup passes below
    $wikitechdir = "${miscdatasetsdir}/wikitech"

    file { $wikitechdir:
        ensure => 'directory',
        owner  => 'root',
        group  => 'root',
        mode   => '0755',
    }

    $wget = '/usr/bin/wget'
    # don't get anything with query params
    $wgetreject = "--reject-regex '(.*)\\?(.*)'"
    # -nv: terse output; -e robots=off: ignore robots.txt; -k: convert links;
    # -nH: no per-host directory; --wait 30: pause 30s between retrievals;
    # -np: never ascend to the parent; -m: mirror; -P: directory to save under
    $wgetargs = "-nv -e robots=off -k -nH --wait 30 -np -m ${url} -P ${wikitechdir}"
    # filter out any complaints about a missing timestamp for index.html,
    # as well as URL download announcements and the summary; we only care
    # about anything else (cron reports whatever output is left)
    $filter = "2>&1  | grep -E -v '(turned off|URL:http|FINISHED|Total|Downloaded)'"
    # toss wikitech's autogenerated index.html files when done
    $cleanuphtml = "find ${wikitechdir} -name 'index.html*' -exec rm {} \\;"
    # remove dumps older than 90 days
    $cleanupold = "find ${wikitechdir} -type f -mtime +90 -exec rm {} \\;"

    cron { 'dumps-fetches-wikitech':
        ensure  => 'present',
        # the steps are chained with ';', so both cleanup passes run
        # regardless of how the wget/grep pipeline exits
        command => "${wget} ${wgetreject} ${wgetargs} ${filter}; ${cleanuphtml}; ${cleanupold}",
        user    => 'root',
        minute  => '20',
        hour    => '3',
        # only install the job once the directory it writes into is managed
        require => File[$wikitechdir],
    }
}