// DumpFetcher.java
package org.wikidata.analyzer.Fetcher;
import org.wikidata.wdtk.dumpfiles.MwDumpFile;
import org.wikidata.wdtk.dumpfiles.MwLocalDumpFile;
import org.wikidata.wdtk.dumpfiles.wmf.JsonOnlineDumpFile;
import org.wikidata.wdtk.util.DirectoryManager;
import org.wikidata.wdtk.util.DirectoryManagerImpl;
import org.wikidata.wdtk.util.WebResourceFetcher;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Retrieves a Wikidata JSON dump for a given date, preferring an already
 * present local copy over downloading one.
 *
 * @author Addshore
 */
public class DumpFetcher {

    private final File dataDirectory;

    /**
     * @param dataDirectory base data directory; its "dumpfiles" subdirectory is
     *                      searched for local dumps and is handed to the online
     *                      dump files as their local directory
     */
    public DumpFetcher(File dataDirectory) {
        this.dataDirectory = dataDirectory;
    }

    /**
     * Try to retrieve the dump for the given date as a dump object, with fallback:
     * 1 - Look for local dump copies (in a collection of locations)
     * 2 - Look online & download dumps (dumps.wikimedia.org first, then archive.org)
     *
     * @param dumpDate date identifier of the wanted dump, as it appears in the
     *                 dump directory and file names
     * @return MwDumpFile the first dump that could be prepared
     * @throws IOException if no source yields a usable dump; the failures of the
     *                     individual sources are attached as suppressed exceptions
     */
    public MwDumpFile getDump( String dumpDate ) throws IOException {
        System.out.println("Getting dump with date " + dumpDate);

        // Collects per-source failures so the final exception can explain what went wrong
        List<IOException> failures = new ArrayList<>();

        MwDumpFile dump = this.findLocalDump(dumpDate, failures);
        if (dump == null) {
            dump = this.fetchOnlineDump(dumpDate, failures);
        }
        if (dump != null) {
            return dump;
        }

        // Everything failed! :(
        IOException failure = new IOException("Failed to get dump from any sources");
        for (IOException cause : failures) {
            failure.addSuppressed(cause);
        }
        throw failure;
    }

    /**
     * Looks for a readable dump file in the known local locations.
     *
     * @return the prepared local dump, or null if none could be used
     */
    private MwDumpFile findLocalDump(String dumpDate, List<IOException> failures) {
        // Look for the dump in a list of possible local locations
        List<String> directoryList = new ArrayList<>();
        // Local data dir location
        directoryList.add(this.dataDirectory + "/dumpfiles/json-" + dumpDate + "/");
        // Labs dump location
        directoryList.add("/public/dumps/public/wikidatawiki/entities/" + dumpDate + "/");
        // Stat1002 dump location
        directoryList.add("/mnt/data/xmldatadumps/public/wikidatawiki/entities/" + dumpDate + "/");

        for (String dumpDirectory : directoryList) {
            System.out.println("Looking for dump files in: " + dumpDirectory);

            // Try a few different file names
            List<String> fileList = new ArrayList<>();
            fileList.add(dumpDirectory + dumpDate + ".json.gz");
            fileList.add(dumpDirectory + dumpDate + "-all.json.gz");
            fileList.add(dumpDirectory + "wikidata-" + dumpDate + ".json.gz");
            fileList.add(dumpDirectory + "wikidata-" + dumpDate + "-all.json.gz");

            for (String dumpLocation : fileList) {
                // isReadable() is false for files that do not exist, so no separate exists() check
                if (!Files.isReadable(Paths.get(dumpLocation))) {
                    continue;
                }
                MwLocalDumpFile localDumpFile = new MwLocalDumpFile( dumpLocation );
                if (!localDumpFile.isAvailable()) {
                    continue;
                }
                try {
                    System.out.println("Using dump file from: " + dumpLocation);
                    localDumpFile.prepareDumpFile();
                    return localDumpFile;
                } catch (IOException exception) {
                    // Remember the failure and keep looking instead of aborting the fallback chain
                    System.out.println("Failed to prepare dump file from: " + dumpLocation
                            + " (" + exception.getMessage() + ")");
                    failures.add(exception);
                }
            }
        }
        return null;
    }

    /**
     * Tries the online dump sources in the order they are registered.
     *
     * @return the prepared online dump, or null if every source failed
     * @throws IOException if the local directory manager cannot be created
     */
    private MwDumpFile fetchOnlineDump(String dumpDate, List<IOException> failures) throws IOException {
        // Get ready to try online dumps
        DirectoryManager localDirectoryManager = new DirectoryManagerImpl(
                Paths.get(this.dataDirectory.getAbsolutePath() + File.separator + "dumpfiles"),
                false
        );
        WebResourceFetcher fetcher = new RedirectFollowingWebResourceFetcherImpl();

        // List the online dumps; LinkedHashMap so they are tried in insertion order
        Map<String, MwDumpFile> onlineDumpMap = new LinkedHashMap<>();
        onlineDumpMap.put(
                "dumps.wikimedia.org",
                new JsonOnlineDumpFile(dumpDate, "wikidatawiki", fetcher, localDirectoryManager)
        );
        onlineDumpMap.put(
                "archive.org",
                new ArchiveOrgJsonOnlineDumpFile(dumpDate, "wikidatawiki", fetcher, localDirectoryManager)
        );

        // Try the online dumps
        for (Map.Entry<String, MwDumpFile> entry : onlineDumpMap.entrySet()) {
            String dumpLocation = entry.getKey();
            MwDumpFile onlineDump = entry.getValue();
            try {
                System.out.println("Looking for & downloading online dump from: " + dumpLocation);
                onlineDump.prepareDumpFile();
                System.out.println("Using dump from: " + dumpLocation);
                return onlineDump;
            } catch (IOException exception) {
                // Record the failure, then move on to the next online dump
                System.out.println("Failed to get dump from: " + dumpLocation
                        + " (" + exception.getMessage() + ")");
                failures.add(exception);
            }
        }
        return null;
    }
}