ArchiveOrgJsonOnlineDumpFile.java

package org.wikidata.analyzer.Fetcher;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.wdtk.dumpfiles.DumpContentType;
import org.wikidata.wdtk.dumpfiles.MwDumpFile;
import org.wikidata.wdtk.dumpfiles.wmf.WmfDumpFile;
import org.wikidata.wdtk.util.CompressionType;
import org.wikidata.wdtk.util.DirectoryManager;
import org.wikidata.wdtk.util.WebResourceFetcher;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * Class to download dumps from archive.org
 *
 * @author Addshore
 */
class ArchiveOrgJsonOnlineDumpFile extends WmfDumpFile implements MwDumpFile {

    private static final Logger logger = LoggerFactory
            .getLogger(ArchiveOrgJsonOnlineDumpFile.class);

    private final WebResourceFetcher webResourceFetcher;
    private final DirectoryManager dumpfileDirectoryManager;

    private boolean isPrepared;

    /**
     * Constructor. Currently only "wikidatawiki" is supported as a project.
     *
     * @param dateStamp
     *            dump date in format YYYYMMDD
     * @param projectName
     *            project name string (e.g. "wikidatawiki")
     * @param webResourceFetcher
     *            object to use for accessing the web
     * @param dumpfileDirectoryManager
     *            the directory manager for the directory where dumps should be
     *            downloaded to
     */
    ArchiveOrgJsonOnlineDumpFile(String dateStamp, String projectName,
                                 WebResourceFetcher webResourceFetcher,
                                 DirectoryManager dumpfileDirectoryManager) {
        super(dateStamp, projectName);
        this.webResourceFetcher = webResourceFetcher;
        this.dumpfileDirectoryManager = dumpfileDirectoryManager;
    }

    @Override
    public DumpContentType getDumpContentType() {
        return DumpContentType.JSON;
    }

    @Override
    protected boolean fetchIsDone() {
        return true;
    }

    @Override
    public InputStream getDumpFileStream() throws IOException {
        prepareDumpFile();

        String fileName = WmfDumpFile.getDumpFileName(DumpContentType.JSON,
                this.projectName, this.dateStamp);
        DirectoryManager dailyDirectoryManager = this.dumpfileDirectoryManager
                .getSubdirectoryManager(WmfDumpFile.getDumpFileDirectoryName(
                        DumpContentType.JSON, this.dateStamp));

        return dailyDirectoryManager.getInputStreamForFile(
                fileName,
                CompressionType.GZIP
        );
    }

    @Override
    public void prepareDumpFile() throws IOException {
        if (this.isPrepared) {
            return;
        }

        String fileName = WmfDumpFile.getDumpFileName(DumpContentType.JSON,
                this.projectName, this.dateStamp);

        List<String> urls = new ArrayList<>();

        // Like http://archive.org/download/wikidata-json-20160104/wikidata-20160104-all.json.gz
        urls.add( "http://archive.org/download/wikidata-json-" + this.dateStamp + "/wikidata-" + this.dateStamp + "-all.json.gz" );
        // Like https://archive.org/download/wikidata-json-20141020/20141020.json.gz
        urls.add( "http://archive.org/download/wikidata-json-" + this.dateStamp + "/" + this.dateStamp + ".json.gz" );

        for( String urlString : urls ) {
            try {
                logger.info("Downloading JSON dump file " + fileName + " from "
                        + urlString + " ...");

                if (!isAvailable()) {
                    continue;
                }

                DirectoryManager dailyDirectoryManager = this.dumpfileDirectoryManager
                        .getSubdirectoryManager(WmfDumpFile.getDumpFileDirectoryName(
                                DumpContentType.JSON, this.dateStamp));

                InputStream inputStream = webResourceFetcher
                        .getInputStreamForUrl(urlString);

                dailyDirectoryManager.createFileAtomic(fileName, inputStream);

                this.isPrepared = true;

                logger.info("... completed download of JSON dump file " + fileName
                        + " from " + urlString);
                return;
            } catch (Exception ignored) {
            }
        }

        throw new IOException(
                "Dump file not available (yet). Aborting dump retrieval."
        );
    }
}