DumpDateFetcher.java
package org.wikidata.analyzer.Fetcher;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.wikidata.wdtk.util.WebResourceFetcher;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
/**
* @author Addshore
*/
public class DumpDateFetcher {
/**
* This presumes that the newest dump is at the bottom of the list (ordered by name)
*
* @return String in format yyyymmdd eg. 20150525
* @throws IOException
*/
public String getLatestOnlineDumpDate() throws IOException {
String html = this.getJsonDumpsPageHtml();
Document doc = Jsoup.parse(html);
Element finalLink = doc.select("a").last();
String fileName = finalLink.html();
return fileName.substring(0, fileName.length() - 8);// remove 8 chars
}
/**
* @return String html of the json dumps page for wikidata
* @throws IOException
*/
private String getJsonDumpsPageHtml() throws IOException {
WebResourceFetcher fetcher = new RedirectFollowingWebResourceFetcherImpl();
Reader r = new InputStreamReader(fetcher.getInputStreamForUrl("http://dumps.wikimedia.org/other/wikidata/"));
StringBuilder buf = new StringBuilder();
while (true) {
int ch = r.read();
if (ch < 0)
break;
buf.append((char) ch);
}
return buf.toString();
}
}