lContentDownloader: Regex Multi-file Downloader
A Java program to download multiple files from a site using regular expressions and wildcards.
/*
 * lContentDownloader
 *
 * Downloads files from websites which use numbers as page iterators,
 * e.g. http://www.notexistingexample.com/posts/12
 * This program is mainly written for downloading images from blogs.
 * The files to be downloaded are selected with a regular expression
 * that is searched for in each src attribute token of the page.
 * The program exits after 50 failed pages in a row (already existing
 * file name, connection problem, nothing found, etc.).
 *
 * Tested with:
 *   http://www.photoschau.de/?paged=*                 regex: uploads
 *   http://momentslikethis.de/page/*                  regex: uploads
 *   http://blog.flickr.net/en/page/*                  regex: staticflickr (need a better regex)
 *   http://www.inspirational-images.tumblr.com/page/* regex: tumblr_
 *   http://www.philmfotos.tumblr.com/page/*           regex: tumblr_
 *
 * @author László Ádám
 * january 6. 2014
 * @version 0.1
 */
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

class lContentDownloader {

    /** Number of consecutive failed pages after which the program gives up. */
    private static final int MAX_CONSECUTIVE_FAILS = 50;

    /** URL schemes that are recognised inside a matching token. */
    private static final String[] URL_SCHEMES = { "http://", "https://" };

    /** Running total of downloaded bytes; a long so it cannot overflow past 2 GB. */
    private static long total;

    /**
     * Downloads one file and stores it in the given directory under the last
     * path segment of its URL.
     *
     * @param fileurl absolute URL of the file; an empty string is rejected
     * @param path    directory to save the file into
     * @return {@code true} only if the file was downloaded completely;
     *         {@code false} if the URL is empty, the target file already
     *         exists, or any error occurred while downloading
     */
    public static boolean downloadAndSave(String fileurl, String path) {
        if (fileurl.isEmpty()) {
            System.out.println("Relative path, sorry...");
            return false;
        }

        // The file name is the last non-empty segment of the URL.
        String[] segments = fileurl.split("/");
        if (segments.length == 0) {
            System.out.println("\tNo file name in: " + fileurl);
            return false;
        }
        File target = new File(path, segments[segments.length - 1]);
        if (target.exists()) {
            System.out.println("\tThe file is already exists.");
            return false;
        }

        long size = 0;
        try {
            URLConnection urlConn = new URL(fileurl).openConnection();
            // The input stream is opened first so that no file is created
            // when the connection itself fails.
            try (InputStream is = urlConn.getInputStream();
                 FileOutputStream fos = new FileOutputStream(target)) {
                byte[] buffer = new byte[4096];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    size += len;
                    fos.write(buffer, 0, len);
                }
            }
        } catch (IOException | RuntimeException e) {
            System.out.println(e);
            // Remove a partially written file so a later run can retry it
            // instead of reporting "already exists".
            if (target.exists()) {
                target.delete();
            }
            return false;
        }

        total += size;
        System.out.println("\tDownloaded: [~" + size / 1024 + " KB] Total: [~"
                + total / 1024 / 1024 + " MB]");
        return true;
    }

    /**
     * Fetches one page and downloads every file whose src token matches the
     * regular expression.
     *
     * @param address URL of the page to scan
     * @param path    directory to save the files into
     * @param regex   regular expression searched for in each src token
     * @return {@code true} if at least one match was found and every matched
     *         file was downloaded; {@code false} if nothing matched, any
     *         download failed, the page could not be read, or the regular
     *         expression is invalid
     */
    public static boolean download(String address, String path, String regex) {
        System.out.println("Looking up: " + address);
        try {
            Pattern pattern = Pattern.compile(regex);
            String page = readPage(new URL(address));

            boolean found = false;
            boolean error = false;
            for (String token : page.split("\\s+")) {
                if (!token.contains("src=")) {
                    continue;
                }
                Matcher matcher = pattern.matcher(token);
                if (!matcher.find()) {
                    continue;
                }
                found = true;
                String fileurl = extractUrl(token);
                System.out.println("\tFound: " + fileurl);
                if (!downloadAndSave(fileurl, path)) {
                    error = true;
                }
            }

            if (!found) {
                System.out.println("\tNothing found here.");
                return false;
            }
            return !error;
        } catch (IOException | RuntimeException e) {
            System.out.println(e);
            return false;
        }
    }

    /** Reads the whole page, keeping a line break between lines so tokens do not fuse. */
    private static String readPage(URL url) throws IOException {
        StringBuilder page = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append('\n');
            }
        }
        return page.toString();
    }

    /**
     * Extracts the first absolute http/https URL from a token, stopping at the
     * closing quote (or at '>' for an unquoted value). Returns "" if the token
     * holds no absolute URL.
     */
    private static String extractUrl(String token) {
        int start = -1;
        for (String scheme : URL_SCHEMES) {
            int idx = token.indexOf(scheme);
            if (idx >= 0 && (start < 0 || idx < start)) {
                start = idx;
            }
        }
        if (start < 0) {
            return "";
        }

        // A quoted value ends at the matching quote; an unquoted one at '>'.
        char opener = start > 0 ? token.charAt(start - 1) : '\0';
        int end = (opener == '"' || opener == '\'')
                ? token.indexOf(opener, start)
                : token.indexOf('>', start);
        if (end < 0) {
            end = token.length();
        }
        return token.substring(start, end);
    }

    /**
     * Entry point. Expects three arguments: the page address containing a '*'
     * placeholder for the page number, the target directory, and the regular
     * expression. Pages are visited from 1 upwards until
     * {@code MAX_CONSECUTIVE_FAILS} pages in a row have failed.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            System.out.println("Usage:\njava lContentDownloader address path regex");
            System.out.println("- address: URL, must contain '*' substitution character");
            System.out.println("- path: path to save files");
            System.out.println("- regex: regular expression to specificate the requested content's urls");
            System.exit(1);
        }
        if (!args[0].contains("*")) {
            System.out.println("URL must contain the '*' substitution character.");
            System.exit(1);
        }
        File f = new File(args[1]);
        if (!f.exists() || !f.isDirectory()) {
            System.out.println("The path does not exist or is not a directory.");
            System.exit(1);
        }
        // Reject a broken regex once, instead of failing on every page until
        // the fail limit is reached.
        try {
            Pattern.compile(args[2]);
        } catch (PatternSyntaxException e) {
            System.out.println("Invalid regular expression: " + e.getMessage());
            System.exit(1);
        }

        total = 0;
        int fails = 0;
        for (int number = 1; fails < MAX_CONSECUTIVE_FAILS; number++) {
            boolean succeed = download(args[0].replace("*", Integer.toString(number)), args[1], args[2]);
            fails = succeed ? 0 : fails + 1;
            if (fails != 0) {
                System.out.println("\tFails: " + fails);
            }
        }
        System.out.println(MAX_CONSECUTIVE_FAILS + " fails exceeded in a row. The program will quit.");
    }
}

This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
Download this code in plain text format here