// User:MER-C/Spamsearch.java
//
// From Wikipedia, the free encyclopedia
/**
 * @(#)Spamsearch.java 0.02 23/10/2007
 * Copyright (C) 2007 MER-C
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.logging.*;
import javax.swing.*;
import java.util.regex.*;

/**
 *  Searches all Wikimedia wikis for spam. Usage: <tt>java Spamsearch
 *  example.com example.net ...</tt>, where example.com and example.net are the
 *  sites spammed. If no sites are given on the command line, they are asked
 *  for in a dialog (whitespace separated). Outputs the results to a text file
 *  in the current directory (i.e. <tt>results.txt</tt>).
 *
 *  Requires Wiki.java 0.11.
 *
 *  Sites are searched one after another. For each site one worker thread per
 *  wiki is started (throttled in batches of 16), the hit counter is reset and
 *  the total is only printed once every worker of that site has finished.
 *
 *  Exit status: 0 on success (or when interrupted), 2 on any other failure.
 *
 *  NOTE(review): the site matrix is fetched over plain http, as in the
 *  original; confirm this still works against the live API.
 *
 *  @author MER-C
 *  @version 0.02
 */
public class Spamsearch
{
    /** Attribute prefix that precedes every wiki URL in the site matrix XML. */
    private static final String URL_ATTRIBUTE = "url=\"";

    // private wikis have API disabled and are NOT GOOD.
    // Current private wikis are anything containing "com." or ".en." and
    // (board|chair|exec|grants|internal|office|otrs-wiki|tlh|wikimaniateam).wikimedia.org.
    private static final Pattern PRIVATE_WIKI =
        Pattern.compile("(com\\.|\\.en\\.|board|chair|exec|grants|internal|office|otrs|tlh|wikimaniateam)");

    private final List<Wiki> wikis = new ArrayList<Wiki>(1333);
    private PrintWriter out; // output file, also used as the lock guarding itself and hits
    private ProgressMonitor monitor; // progress monitor of the site currently searched
    private int progress = 0; // only touched while no workers run, or on the event dispatch thread
    private int hits = 0; // number of links found for the current site; guarded by out

    public static void main(String[] args) throws IOException
    {
        new Spamsearch(args);
    }

    /**
     *  Runs the whole search and terminates the JVM when done.
     *
     *  @param args the sites to search for; if empty the user is prompted
     */
    private Spamsearch(String[] args)
    {
        // check if command line arguments were specified
        if (args.length == 0)
        {
            String sites = JOptionPane.showInputDialog(null, "Enter sites to search");
            // showInputDialog returns null when the dialog is cancelled
            if (sites == null || sites.trim().length() == 0)
                System.exit(0);
            // trim + "\\s+" so repeated whitespace does not yield empty site names
            args = sites.trim().split("\\s+");
        }

        int status = 0;
        try
        {
            // various initialisation
            out = new PrintWriter(new FileWriter("results.txt"));
            out.println("Starting spamsearch at " + new Date() + ".");

            // suppress log records below INFO
            Logger.getLogger("wiki").setLevel(Level.INFO);

            // fetch site matrix and build the list of wikis to search
            Logger.getLogger("wiki").info("Fetching site matrix.");
            for (String domain : parseDomains(fetchSiteMatrix()))
                wikis.add(new Wiki(domain));

            // now do the searching
            for (int i = 0; i < args.length; i++)
                searchSite(args[i]);
        }
        catch (InterruptedException ex)
        {
            // interruption is treated as a request to stop early, not as an error
            Thread.currentThread().interrupt();
        }
        catch (Exception ex)
        {
            ex.printStackTrace();
            status = 2;
        }
        finally
        {
            // always close (and thereby flush) the results, even on failure
            if (out != null)
            {
                synchronized (out)
                {
                    out.close();
                }
            }
        }
        System.exit(status);
    }

    /**
     *  Downloads the site matrix from the English Wikipedia API.
     *
     *  @return the complete XML response with line terminators removed
     *  @throws IOException if the response cannot be read
     */
    private static String fetchSiteMatrix() throws IOException
    {
        InputStream in = new URL("http://en.wikipedia.org/w/api.php?action=sitematrix&format=xml").openStream();
        try
        {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            StringBuilder xml = new StringBuilder();
            String line;
            // read everything: the response is not guaranteed to be a single line
            while ((line = reader.readLine()) != null)
                xml.append(line);
            return xml.toString();
        }
        finally
        {
            in.close();
        }
    }

    /**
     *  Extracts the host name of every <tt>url="..."</tt> attribute in the
     *  given XML, skipping hosts that match the private wiki pattern.
     *
     *  @param xml the site matrix XML
     *  @return the domains to search, in document order
     */
    private static List<String> parseDomains(String xml)
    {
        List<String> domains = new ArrayList<String>();
        int pos = 0;
        while (true)
        {
            int start = xml.indexOf(URL_ATTRIBUTE, pos);
            if (start < 0)
                break;
            start += URL_ATTRIBUTE.length();
            int end = xml.indexOf('"', start);
            if (end < 0)
                break; // unterminated attribute, nothing more to parse
            pos = end + 1;

            String domain = hostOf(xml.substring(start, end));
            if (domain.length() == 0)
                continue;
            if (PRIVATE_WIKI.matcher(domain).find()) // private wiki, WOOP WOOP WOOP
                continue;
            domains.add(domain);
        }
        return domains;
    }

    /**
     *  Reduces a URL to its host part: drops everything up to and including
     *  the first <tt>//</tt> (if any) and everything from the next slash on.
     *
     *  @param url a URL such as <tt>http://en.wikipedia.org/</tt>
     *  @return the host, e.g. <tt>en.wikipedia.org</tt>
     */
    private static String hostOf(String url)
    {
        int scheme = url.indexOf("//");
        String host = (scheme >= 0) ? url.substring(scheme + 2) : url;
        int slash = host.indexOf('/');
        return (slash >= 0) ? host.substring(0, slash) : host;
    }

    /**
     *  Searches every wiki for links to one site and writes the total.
     *  Returns only after all worker threads of this site have finished.
     *
     *  @param site the spammed site, e.g. <tt>example.com</tt>
     *  @throws IOException if the site cannot be resolved
     *  @throws InterruptedException if interrupted while waiting for workers
     */
    private void searchSite(String site) throws IOException, InterruptedException
    {
        final int total = wikis.size();

        // reset per-site state; no workers are running at this point
        monitor = new ProgressMonitor(new JFrame(), "Searching for spamlink ", site, 0, total);
        progress = 0;

        // resolve the website
        InetAddress[] addresses = InetAddress.getAllByName(site);
        synchronized (out)
        {
            hits = 0;
            for (int j = 0; j < addresses.length; j++)
                out.println(addresses[j]);
            out.println("Searching " + total + " wikis.\n");
        }

        // search for links; the latch reaches zero once every worker is done
        CountDownLatch pending = new CountDownLatch(total);
        for (int j = 0; j < total; j++)
        {
            newThread("*." + site, j, pending);
            if (j % 16 == 15) // wait for a while
                Thread.sleep(8500);
        }
        pending.await();

        synchronized (out)
        {
            out.println("" + hits + " links found.\n");
            out.flush();
        }

        // recycle monitor on the event dispatch thread; invokeAndWait runs after
        // every progress update already queued by the workers, so none of them
        // can leak into the next site's monitor
        try
        {
            SwingUtilities.invokeAndWait(new Runnable()
            {
                public void run()
                {
                    monitor.close();
                    monitor = null;
                    progress = 0;
                }
            });
        }
        catch (java.lang.reflect.InvocationTargetException ex)
        {
            throw new IllegalStateException("Could not close the progress monitor", ex);
        }
    }

    /**
     *  Speed optimisation (runtime approx 4200s beforehand) because the
     *  internet is the major limitation. Starts a thread that searches one
     *  wiki, appends its results to the output file and counts down the latch.
     *  An IOException during the search terminates the program with status 2.
     *
     *  @param domain the link pattern to search for, e.g. <tt>*.example.com</tt>
     *  @param i index into the list of wikis
     *  @param pending latch counted down once when this worker is finished
     */
    private void newThread(final String domain, final int i, final CountDownLatch pending)
    {
        new Thread()
        {
            public void run()
            {
                try
                {
                    Wiki wiki = wikis.get(i);
                    wiki.setMaxLag(-1); // disable maxlag for performance

                    // do spamsearch; raw type because that is what Wiki returns here
                    ArrayList[] links = wiki.spamsearch(domain);

                    synchronized (out) // so the output file and hit count don't get messed up
                    {
                        hits += links[0].size();

                        // don't print anything if there are no results
                        if (!links[0].isEmpty())
                        {
                            out.println("Results for " + wiki.getDomain() + "...");
                            for (int k = 0; k < links[0].size(); k++)
                                out.println("Page: " + links[0].get(k) + " URL: " + links[1].get(k));
                            out.println();
                        }
                    }
                }
                catch (IOException ex)
                {
                    System.err.println(ex);
                    synchronized (out)
                    {
                        out.flush();
                    }
                    System.exit(2);
                }
                finally
                {
                    // update the progress monitor
                    SwingUtilities.invokeLater(new Runnable()
                    {
                        public void run()
                        {
                            if (monitor != null)
                                monitor.setProgress(++progress);
                        }
                    });
                    // count down last, so the progress update is queued before
                    // the main thread can move on to the next site
                    pending.countDown();
                }
            }
        }.start();
    }
}