1 package org.wikidata.analyzer.Fetcher;
2
3 import org.wikidata.wdtk.dumpfiles.MwDumpFile;
4 import org.wikidata.wdtk.dumpfiles.MwLocalDumpFile;
5 import org.wikidata.wdtk.dumpfiles.wmf.JsonOnlineDumpFile;
6 import org.wikidata.wdtk.util.DirectoryManager;
7 import org.wikidata.wdtk.util.DirectoryManagerImpl;
8 import org.wikidata.wdtk.util.WebResourceFetcher;
9
10 import java.io.File;
11 import java.io.IOException;
12 import java.nio.file.Files;
13 import java.nio.file.Paths;
14 import java.util.ArrayList;
15 import java.util.HashMap;
16 import java.util.List;
17 import java.util.Map;
18
19
20
21
22 public class DumpFetcher {
23
24 private File dataDirectory;
25
26 public DumpFetcher(File dataDirectory) {
27 this.dataDirectory = dataDirectory;
28 }
29
30
31
32
33
34
35
36
37
38 public MwDumpFile getDump( String dumpDate ) throws IOException {
39 System.out.println("Getting dump with date " + dumpDate);
40
41
42 List<String> directoryList = new ArrayList<>();
43
44 directoryList.add(this.dataDirectory + "/dumpfiles/json-" + dumpDate + "/");
45
46 directoryList.add("/public/dumps/public/wikidatawiki/entities/" + dumpDate + "/");
47
48 directoryList.add("/mnt/data/xmldatadumps/public/wikidatawiki/entities/" + dumpDate + "/");
49
50 for (String dumpDirectory: directoryList) {
51 System.out.println("Looking for dump files in: " + dumpDirectory);
52
53
54 List<String> fileList = new ArrayList<>();
55 fileList.add(dumpDirectory + dumpDate + ".json.gz");
56 fileList.add(dumpDirectory + dumpDate + "-all.json.gz");
57 fileList.add(dumpDirectory + "wikidata-" + dumpDate + ".json.gz");
58 fileList.add(dumpDirectory + "wikidata-" + dumpDate + "-all.json.gz");
59
60 for (String dumpLocation: fileList) {
61 if (Files.exists(Paths.get(dumpLocation)) && Files.isReadable(Paths.get(dumpLocation))) {
62 MwLocalDumpFile localDumpFile = new MwLocalDumpFile( dumpLocation );
63 if( localDumpFile.isAvailable() ) {
64 System.out.println("Using dump file from: " + dumpLocation);
65 localDumpFile.prepareDumpFile();
66 return localDumpFile;
67 }
68 }
69 }
70
71 }
72
73
74 DirectoryManager localDirectoryManager = new DirectoryManagerImpl(
75 Paths.get(this.dataDirectory.getAbsolutePath() + File.separator + "dumpfiles"),
76 false
77 );
78 WebResourceFetcher fetcher = new RedirectFollowingWebResourceFetcherImpl();
79
80
81 Map<String, MwDumpFile> onlineDumpMap = new HashMap<String, MwDumpFile>();
82
83 onlineDumpMap.put(
84 "dumps.wikimedia.org",
85 new JsonOnlineDumpFile(dumpDate, "wikidatawiki", fetcher, localDirectoryManager)
86 );
87 onlineDumpMap.put(
88 "archive.org",
89 new ArchiveOrgJsonOnlineDumpFile(dumpDate, "wikidatawiki", fetcher, localDirectoryManager)
90 );
91
92
93 for ( Map.Entry<String, MwDumpFile> entry : onlineDumpMap.entrySet() ) {
94 String dumpLocation = entry.getKey();
95 MwDumpFile onlineDump = entry.getValue();
96 try{
97 System.out.println("Looking for & downloading online dump from: " + dumpLocation);
98 onlineDump.prepareDumpFile();
99 System.out.println("Using dump from: " + dumpLocation);
100 return onlineDump;
101 } catch ( IOException exception ) {
102
103 }
104 }
105
106
107 throw new IOException("Failed to get dump from any sources");
108 }
109 }