View Javadoc
1   package org.wikidata.analyzer.Fetcher;
2   
3   import org.wikidata.wdtk.dumpfiles.MwDumpFile;
4   import org.wikidata.wdtk.dumpfiles.MwLocalDumpFile;
5   import org.wikidata.wdtk.dumpfiles.wmf.JsonOnlineDumpFile;
6   import org.wikidata.wdtk.util.DirectoryManager;
7   import org.wikidata.wdtk.util.DirectoryManagerImpl;
8   import org.wikidata.wdtk.util.WebResourceFetcher;
9   
10  import java.io.File;
11  import java.io.IOException;
12  import java.nio.file.Files;
13  import java.nio.file.Paths;
14  import java.util.ArrayList;
15  import java.util.HashMap;
16  import java.util.List;
17  import java.util.Map;
18  
19  /**
20   * @author Addshore
21   */
22  public class DumpFetcher {
23  
24      private File dataDirectory;
25  
26      public DumpFetcher(File dataDirectory) {
27          this.dataDirectory = dataDirectory;
28      }
29  
30      /**
31       * Look for the most recent dump date online and try to retrieve as dump object with fallback:
32       * 1 - Look for local dump copies (in a collection of locations)
33       * 2 - Look online & download dumps
34       *
35       * @return MwDumpFile
36       * @throws IOException
37       */
38      public MwDumpFile getDump( String dumpDate ) throws IOException {
39          System.out.println("Getting dump with date " + dumpDate);
40  
41          // Look for the dump in a list of possible local locations
42          List<String> directoryList = new ArrayList<>();
43          //Local data dir location
44          directoryList.add(this.dataDirectory + "/dumpfiles/json-" + dumpDate + "/");
45          //Labs dump location
46          directoryList.add("/public/dumps/public/wikidatawiki/entities/" + dumpDate + "/");
47          //Stat1002 dump location
48          directoryList.add("/mnt/data/xmldatadumps/public/wikidatawiki/entities/" + dumpDate + "/");
49  
50          for (String dumpDirectory: directoryList) {
51              System.out.println("Looking for dump files in: " + dumpDirectory);
52  
53              // Try and few different file names
54              List<String> fileList = new ArrayList<>();
55              fileList.add(dumpDirectory + dumpDate + ".json.gz");
56              fileList.add(dumpDirectory + dumpDate + "-all.json.gz");
57              fileList.add(dumpDirectory + "wikidata-" + dumpDate + ".json.gz");
58              fileList.add(dumpDirectory + "wikidata-" + dumpDate + "-all.json.gz");
59  
60              for (String dumpLocation: fileList) {
61                  if (Files.exists(Paths.get(dumpLocation)) && Files.isReadable(Paths.get(dumpLocation))) {
62                      MwLocalDumpFile localDumpFile = new MwLocalDumpFile( dumpLocation );
63                      if( localDumpFile.isAvailable() ) {
64                          System.out.println("Using dump file from: " + dumpLocation);
65                          localDumpFile.prepareDumpFile();
66                          return localDumpFile;
67                      }
68                  }
69              }
70  
71          }
72  
73          // Get ready to try online dumps
74          DirectoryManager localDirectoryManager = new DirectoryManagerImpl(
75                  Paths.get(this.dataDirectory.getAbsolutePath() + File.separator + "dumpfiles"),
76                  false
77          );
78          WebResourceFetcher fetcher = new RedirectFollowingWebResourceFetcherImpl();
79  
80          // List the online dumps
81          Map<String, MwDumpFile> onlineDumpMap = new HashMap<String, MwDumpFile>();
82          // dumps.wikimedia.org
83          onlineDumpMap.put(
84                  "dumps.wikimedia.org",
85                  new JsonOnlineDumpFile(dumpDate, "wikidatawiki", fetcher, localDirectoryManager)
86          );
87          onlineDumpMap.put(
88                  "archive.org",
89                  new ArchiveOrgJsonOnlineDumpFile(dumpDate, "wikidatawiki", fetcher, localDirectoryManager)
90          );
91  
92          // Try the online dumps
93          for ( Map.Entry<String, MwDumpFile> entry : onlineDumpMap.entrySet() ) {
94              String dumpLocation = entry.getKey();
95              MwDumpFile onlineDump = entry.getValue();
96              try{
97                  System.out.println("Looking for & downloading online dump from: " + dumpLocation);
98                  onlineDump.prepareDumpFile();
99                  System.out.println("Using dump from: " + dumpLocation);
100                 return onlineDump;
101             } catch ( IOException exception ) {
102                 // Ignore the exception so we can try the next online dump
103             }
104         }
105 
106         // Everything failed! :(
107         throw new IOException("Failed to get dump from any sources");
108     }
109 }