package comirva.web.indexing;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Date;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.SimpleFSDirectory;

/* loaded from: input_file:comirva/web/indexing/HTMLIndexer.class */
/**
 * Command-line tool that walks a directory tree of crawled {@code .html} pages and adds them
 * to a Lucene index located at {@link #PATH_TO_INDEX}.
 *
 * <p>For every {@code N.html} file the indexer looks for a sibling {@code urls.dat} file and
 * takes its N-th line (0-based) as the page URL; the name of the containing directory is used
 * as the artist. Progress and errors are appended to {@link #LOG_FILE}.
 *
 * <p>All state is held in static fields, so the class is not safe for concurrent use.
 */
public class HTMLIndexer {
    protected static final String ROOT_DIR = "C:/Research/Data/web_crawls/C412a/Google_2010-01-08/M/";
    protected static final String PATH_TO_INDEX = "C:/Research/Data/web_crawls/C412a/Google_2010-01-08/M/../M_index/";
    protected static final String LOG_FILE = "C:/Research/Data/web_crawls/C412a/Google_2010-01-08/M/../M_index/log.txt";
    protected static final String TERMS_TO_INDEX = "C:/Research/Data/web_crawls/music_terms.txt";
    protected static final boolean VERBOSE_OUTPUT = false;
    protected static final boolean CREATE_NEW_INDEX = true;
    protected static final int MAX_RAM_BUFFER_SIZE_MB = 256;
    protected static final long MAX_INDEXED_FILE_SIZE = 5000000;

    /** Name of the index field that holds a document's unique id. */
    private static final String UID_FIELD = "uid";
    /** Extension of the files that get indexed. */
    private static final String HTML_SUFFIX = ".html";
    /** Upper bound on the number of terms indexed per field (applied to the writer in main). */
    private static final int MAX_FIELD_LENGTH = 10000;

    private static File indexRootDir = new File(ROOT_DIR);
    private static String indexPath = PATH_TO_INDEX;
    private static boolean createNewIndex = CREATE_NEW_INDEX;
    /** True while stale documents are being removed from an existing index. */
    private static boolean deleting = false;
    private static IndexReader reader;
    private static IndexWriter writer;
    /** Enumeration over the uid terms of the existing index; null when building from scratch. */
    private static TermEnum uidIter;
    private static File logFile = LOG_FILE == null ? null : new File(LOG_FILE);
    private static BufferedWriter bwLog = null;
    // NOTE(review): url is only overwritten when a URL is found for the current file, so a file
    // without a matching urls.dat entry is indexed with the previous file's URL. Kept as-is
    // because it is unclear whether HTMLDocument.Document accepts a null URL - confirm and reset.
    private static String url;
    private static String artist;

    /**
     * Builds (or updates) the index and logs the total running time in milliseconds.
     *
     * <p>When an existing index is updated, a first pass removes documents whose uid no longer
     * matches a file on disk before the writer is opened for the second, adding pass.
     *
     * @param strArr command-line arguments (ignored)
     */
    public static void main(String[] strArr) {
        try {
            long start = System.currentTimeMillis();
            if (logFile != null) {
                File logDir = logFile.getParentFile();
                if (logDir != null && !logDir.exists()) {
                    // mkdirs() also creates missing intermediate directories.
                    logDir.mkdirs();
                }
                bwLog = new BufferedWriter(new FileWriter(logFile, false));
            }
            if (!createNewIndex) {
                deleting = true;
                indexDocs(indexRootDir, indexPath, createNewIndex);
            }
            writer = new IndexWriter(new SimpleFSDirectory(new File(indexPath)), new HTMLAnalyzer(TERMS_TO_INDEX), createNewIndex, IndexWriter.MaxFieldLength.UNLIMITED);
            writer.setMaxFieldLength(MAX_FIELD_LENGTH);
            writer.setRAMBufferSizeMB(MAX_RAM_BUFFER_SIZE_MB);
            indexDocs(indexRootDir, indexPath, createNewIndex);
            writeToLogAndSysOut("Optimizing index...\n");
            writer.optimize();
            writer.close();
            writeToLogAndSysOut(Long.toString(System.currentTimeMillis() - start));
            writeToLogAndSysOut(" total milliseconds\n");
        } catch (Exception e) {
            // Log the failure while the log is still open; it is closed in the finally block.
            writeToLogAndSysOut(" caught a " + e.getClass() + "\n with message: " + e.getMessage() + "\n");
        } finally {
            if (bwLog != null) {
                try {
                    bwLog.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Indexes everything below {@code file}.
     *
     * <p>If {@code z} is true the tree is simply added to the index. Otherwise the existing
     * index at {@code str} is opened and its uid terms are walked alongside the files, so that
     * files already present are skipped and, while {@link #deleting} is set, index entries that
     * were not matched by a file are deleted.
     *
     * @param file root directory (or single file) to index
     * @param str  path of the index directory
     * @param z    true to build a new index, false to update the existing one
     * @throws Exception if reading files or accessing the index fails
     */
    private static void indexDocs(File file, String str, boolean z) throws Exception {
        if (z) {
            indexDocs(file);
            return;
        }
        try {
            // readOnly=false: this reader is used to delete documents.
            reader = IndexReader.open(new SimpleFSDirectory(new File(str)), false);
            uidIter = reader.terms(new Term(UID_FIELD, ""));
            indexDocs(file);
            if (deleting) {
                // Whatever uid terms remain were not matched by any file on disk.
                while (atUidTerm()) {
                    writeToLogAndSysOut("deleting " + HTMLDocument.uid2url(uidIter.term().text()) + "\n");
                    reader.deleteDocuments(uidIter.term());
                    uidIter.next();
                }
                deleting = false;
            }
        } finally {
            // Release index resources even on failure and do not leave closed handles behind.
            try {
                if (uidIter != null) {
                    uidIter.close();
                }
            } finally {
                uidIter = null;
                if (reader != null) {
                    reader.close();
                    reader = null;
                }
            }
        }
    }

    /**
     * Recursively processes {@code file}: directories are traversed in sorted name order,
     * {@code .html} files are added to the index (or reconciled against the existing index when
     * {@link #uidIter} is set); all other files are ignored.
     *
     * @param file directory or file to process
     * @throws Exception if reading files or accessing the index fails
     */
    private static void indexDocs(File file) throws Exception {
        if (file.isDirectory()) {
            String[] children = file.list();
            if (children == null) {
                // File.list() returns null when the directory cannot be read.
                writeToLogAndSysOut("warning: cannot list directory " + file.getAbsolutePath() + "\n");
                return;
            }
            Arrays.sort(children);
            System.out.println("processing directory " + file.getAbsolutePath());
            for (String child : children) {
                indexDocs(new File(file, child));
            }
            return;
        }
        if (!file.getPath().endsWith(HTML_SUFFIX)) {
            return;
        }

        String foundUrl = lookUpUrl(file);
        if (foundUrl != null) {
            url = foundUrl;
        }
        artist = file.getParentFile().getName().trim();

        if (uidIter == null) {
            // Building from scratch: nothing to reconcile against.
            addToIndex(file);
            return;
        }

        String uid = HTMLDocument.uid(file);
        // Skip (and, in the deleting pass, remove) index entries that sort before this file's uid.
        while (atUidTerm() && uidIter.term().text().compareTo(uid) < 0) {
            if (deleting) {
                writeToLogAndSysOut("deleting " + HTMLDocument.uid2url(uidIter.term().text()) + "\n");
                reader.deleteDocuments(uidIter.term());
            }
            uidIter.next();
        }
        if (atUidTerm() && uidIter.term().text().compareTo(uid) == 0) {
            writeToLogAndSysOut("document " + HTMLDocument.uid2url(uidIter.term().text()) + " already in index\n");
            uidIter.next();
        } else if (!deleting) {
            // Documents are only added in the second pass, when the writer is open.
            addToIndex(file);
        }
    }

    /** Returns true if {@link #uidIter} is positioned on a term of the uid field. */
    private static boolean atUidTerm() throws IOException {
        Term current = uidIter.term();
        return current != null && UID_FIELD.equals(current.field());
    }

    /**
     * Reads the URL of {@code htmlFile} from the {@code urls.dat} file in the same directory.
     * The numeric file name (without {@code .html}) is the 0-based line number of the URL.
     *
     * @return the trimmed URL line, or null if {@code urls.dat} is missing or too short
     * @throws NumberFormatException if {@code urls.dat} exists and the file name is not numeric
     * @throws IOException           if {@code urls.dat} cannot be read
     */
    private static String lookUpUrl(File htmlFile) throws IOException {
        File urlList = new File(htmlFile.getParentFile(), "urls.dat");
        if (!urlList.exists()) {
            return null;
        }
        BufferedReader in = new BufferedReader(new FileReader(urlList));
        try {
            String name = htmlFile.getName();
            int wanted = Integer.parseInt(name.substring(0, name.length() - HTML_SUFFIX.length()));
            for (int lineNo = 0; lineNo <= wanted; lineNo++) {
                String line = in.readLine();
                if (line == null) {
                    break;
                }
                if (lineNo == wanted) {
                    return line.trim();
                }
            }
            return null;
        } finally {
            in.close();
        }
    }

    /**
     * Adds {@code file} to the index using the current {@link #artist} and {@link #url}, unless
     * it is larger than {@link #MAX_INDEXED_FILE_SIZE}, in which case a warning is logged.
     */
    private static void addToIndex(File file) throws Exception {
        if (file.length() > MAX_INDEXED_FILE_SIZE) {
            writeToLogAndSysOut("warning: " + file.getAbsolutePath() + " (" + file.length() + " bytes) exceeds the maximum file size of " + MAX_INDEXED_FILE_SIZE + " and thus will not be indexed\n");
            return;
        }
        Document doc = HTMLDocument.Document(file, artist, url);
        writeToLogAndSysOut("adding " + doc.get("path") + ", artist:" + doc.get("artist") + "\n");
        writer.addDocument(doc);
    }

    /**
     * Appends {@code str} to the log file if one is open and, when {@link #VERBOSE_OUTPUT} is
     * enabled, echoes it to standard output. Failures to write are reported on stderr only so
     * that logging problems never abort indexing.
     */
    private static void writeToLogAndSysOut(String str) {
        if (VERBOSE_OUTPUT) {
            System.out.print(str);
        }
        try {
            if (bwLog != null) {
                bwLog.append(str);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
