/** * @(#)Spamsearch.java 0.02 23/10/2007 * Copyright (C) 2007 MER-C * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */importjava.io.*;importjava.net.*;importjava.util.*;importjava.util.concurrent.*;importjava.util.logging.*;importjavax.swing.*;importjava.util.regex.*;/** * Searches all Wikimedia wikis for spam. Usage: <tt>java Spamsearch * example.com example.net ...</tt>, where example.com and example.net are the * sites spammed. Outputs the results to a text file in the current directory * (i.e. <tt>results.txt</tt>) * * Requires Wiki.java 0.11. * * KNOWN ISSUES: multi-site search does not work for some reason. * * @author MER-C * @version 0.02 */publicclassSpamsearch{privateArrayList<Wiki>wikis=newArrayList(1333);privatePrintWriterout;// output fileprivateProgressMonitormonitor;// progress monitorprivateintprogress=0;privateinthits=0;// number of links foundpublicstaticvoidmain(String[]args)throwsIOException{newSpamsearch(args);}privateSpamsearch(String[]args){// check if command line arguments were specifiedif(args.length==0){Stringsites=JOptionPane.showInputDialog(null,"Enter sites to search");args=sites.split("\\s");}try{// various initialisationout=newPrintWriter(newFileWriter("results.txt"));out.println("Starting spamsearch at "+newDate()+".");// suppress log records below INFOLogger.getLogger("wiki").setLevel(Level.INFO);// fetch site matrixLogger.getLogger("wiki").info("Fetching site matrix.");InputStreamin=newURL("http://en.wikipedia.org/w/api.php?action=sitematrix&format=xml").openStream();BufferedReaderreader=newBufferedReader(newInputStreamReader(in));Stringline=reader.readLine();// private wikis have API disabled and are NOT GOOD.// Current private wikis are anything containing "com." or ".en." and// (board|chair|exec|grants|internal|office|otrs-wiki|tlh|wikimaniateam).wikimedia.org.Stringpattern="(com\\.|\\.en\\.|board|chair|exec|grants|internal|office|otrs|tlh|wikimaniateam)";Patternp=Pattern.compile(pattern);// parse the listwhile(line.contains("url=\"")){inta=line.indexOf("url=\"")+12;intb=line.indexOf("\"",a)-1;Stringdomain=line.substring(a,b);Matchermatcher=p.matcher(domain);if(matcher.find())// private wiki, WOOP WOOP WOOP{line=line.substring(b);continue;}Wikiwiki=newWiki(domain);wikis.add(wiki);line=line.substring(b);}// now do the searchingfor(inti=0;i<args.length;i++){// reset progress monitormonitor=newProgressMonitor(newJFrame(),"Searching for spamlink ",args[i],0,wikis.size());// resolve the websiteInetAddress[]addresses=InetAddress.getAllByName(args[i]);for(intj=0;j<addresses.length;j++)out.println(addresses[j]);out.println("Searching "+wikis.size()+" wikis.\n");// search for linksfor(intj=0;j<wikis.size();j++){newThread("*."+args[i],j);if(j%16==15)// wait for a whileThread.sleep(8500);}synchronized(out){out.wait();Thread.sleep(7500);out.println(""+hits+" links found.\n");}// recycle monitormonitor.close();monitor=null;progress=0;}}catch(Exceptionex){if(!(exinstanceofInterruptedException)){ex.printStackTrace();System.exit(2);}}synchronized(out){out.close();}System.exit(0);}/** * Speed optimisation (runtime approx 4200s beforehand) because the * internet is the major limitation. Don't you just love multithreading? */privatevoidnewThread(finalStringdomain,finalinti){newThread(){publicvoidrun(){Wikiwiki=wikis.get(i);wiki.setMaxLag(-1);// disable maxlag for performancetry{// do spamsearchArrayList[]links=wiki.spamsearch(domain);hits+=links[0].size();synchronized(out)// so the output file doesn't get messed up{// don't print anything if there are no resultsif(!links[0].isEmpty()){out.println("Results for "+wiki.getDomain()+"...");for(intk=0;k<links[0].size();k++)out.println("Page: "+links[0].get(k)+" URL: "+links[1].get(k));out.println();}// done spamsearchingif(i==wikis.size()-1){out.flush();out.notifyAll();}}}catch(IOExceptionex){System.err.println(ex);out.flush();System.exit(2);}// update the progress monitorSwingUtilities.invokeLater(newRunnable(){publicvoidrun(){if(monitor!=null)monitor.setProgress(++progress);}});}}.start();}}