import java.io.*;
import java.lang.*;
import java.util.*;

/**
 * LogSummary - summarise change activity by date in web server log(s).
 *
 * Kent Fitch
 * Project Computing Pty Ltd
 * http://www.projectcomputing.com
 * Nov 2002
 *
 * Updated KF 3Dec02 - rather than storing a single length for a URL, remember
 * old lengths and don't treat recurrences of an old length as an update
 * (because they *probably* aren't)
 *
 * This program reads 1 or more web server log files and attempts to estimate
 * what percentages of responses are "novel" and hence archivable by pageVault.
 *
 * The program is very simple minded - it just inspects the date, URL, HTTP
 * resp code and content-length fields. As the actual bytes of the response
 * are not available, it can't really know whether a response is novel, so it
 * guesses based on the URL and content-length (as logged).
 *
 * It ignores log lines it can't understand, and those with non-200 HTTP resp
 * codes. For the remaining lines it checks whether the URL has been seen
 * before, and if so, whether the content length has changed.
 *
 * If not seen before, it counts the response as "new".
 * If seen before but with a different length, it counts the response as
 * "updated".
 * Otherwise, it counts the response as "same".
 *
 * For each day in the log(s) processed, it then generates a report showing
 * counts, bytes and percentages of responses in each category (same, new,
 * updated). Note that the reporting base is cumulative, so you'd expect to
 * see fewer "new" responses as time goes on for most web sites.
 *
 * The parameters to this program are a series (1 or more) of input log files
 * to read, assumed to contain log entries in ascending date order.
 *
 * The format of the log entries is expected to be something like this
 * (Apache format):
 *
 * 127.0.0.1 - - [17/Oct/2002:14:05:08 +1100] "GET /cgi-bin/test?parm1=+junk&parm2=x HTTP/1.1" 200 21411
 *
 * This program is placed into the public domain. It may be used by anyone
 * for any purpose.
 */
public class LogSummary {

    /** Value of {@link #lastDate} before any analysable line has been seen. */
    private static final String NO_DATE = "-";

    /** Number of characters following '[' that form the day key, e.g. "17/Oct/2002". */
    private static final int DATE_LENGTH = 11;

    /** Day key of the lines currently being accumulated. */
    private String lastDate = NO_DATE;

    // Per-day counters; reset by clearDailyCounters() after each daily report.
    private int unparseableLines = 0;
    private int non200Lines = 0;
    private int totalRequests = 0;
    private int analysedRequests = 0;
    private int newCount = 0;
    private int modCount = 0;
    private int sameCount = 0;
    private long newBytes = 0;
    private long modBytes = 0;
    private long sameBytes = 0;

    /**
     * URL -> Set of the content-length strings already logged for that URL.
     * Deliberately never cleared: the "seen before" base is cumulative across
     * days and across input files. (Raw collection types are kept because the
     * rest of the file uses no post-1.4 language features.)
     */
    private Map urls = new HashMap(200000);

    /**
     * Reads every named log file in order and writes the complete report, as
     * an HTML document, to standard output.
     *
     * @param args paths of the web server log files, in ascending date order
     * @throws IOException if a log file cannot be opened or read
     */
    LogSummary(String args[]) throws IOException {
        System.out.println(
            "<html><head><title>pageVault Log Analyzer - update volume estimator</title>\n"
            + "</head>\n"
            + "<body>\n"
            + "<h1>pageVault Log Analyzer - update volume estimator</h1>\n"
            + "<p>This tool provides a very rough estimate of new and updated content "
            + "delivery as a percentage of total site output, and hence can be used to "
            + "estimate the volume of material pageVault would archive.</p>\n"
            + "<p>Note that this tool is not accurate because it guesses what content was new or "
            + "updated simply by looking at the URL and response length as logged by the web "
            + "server. Hence, genuinely different content with the same length is not counted "
            + "as different, and materially identical content with different response lengths "
            + "caused by immaterial variations is counted as new/updated. Nevertheless, it "
            + "provides a ballpark guide, but you must be aware of the effect of these "
            + "issues to use it to provide a more accurate predicator of pageVault archive volume."
            + "</p>\n"
            + "<p>Kent Fitch, Project Computing Pty Ltd"
            + "</p>\n");

        System.out.println("<p>LogSummary started " + (new Date()) + " with "
            + args.length + " input files</p>\n");

        // Three header rows; every data row must supply 18 cells to match.
        System.out.println("<table border=\"1\">"
            + "<tr><th rowspan=\"3\">Date</th><th colspan=\"3\">Request</th>"
            + "<th colspan=\"2\">Analyzed</th>\n"
            + "<th colspan=\"4\">Same</th>"
            + "<th colspan=\"4\">New</th>"
            + "<th colspan=\"4\">Updated</th></tr>"
            + "<tr><th rowspan=\"2\">Total</th><th rowspan=\"2\">Can't Parse</th>"
            + "<th rowspan=\"2\">Non 200</th>"
            + "<th rowspan=\"2\">Count</th><th rowspan=\"2\">Vol<br>MB</th>\n"
            + "<th colspan=\"2\">Count</th><th colspan=\"2\">Vol</th>"
            + "<th colspan=\"2\">Count</th><th colspan=\"2\">Vol</th>"
            + "<th colspan=\"2\">Count</th><th colspan=\"2\">Vol</th></tr>"
            + "<tr><th>Num</th><th>%</th><th>MB</th><th>%</th>"
            + "<th>Num</th><th>%</th><th>MB</th><th>%</th>"
            + "<th>Num</th><th>%</th><th>MB</th><th>%</th></tr>\n");

        for (int i = 0; i < args.length; i++) {
            processFileName(args[i]);
        }

        // Daily rows are only emitted when the date changes, so the last day
        // (or a run with no analysable line at all) still has to be flushed.
        if (totalRequests > 0) {
            reportDailyStats(); // report the final date stats
        }

        System.out.println("</table><p>LogSummary completed " + (new Date())
            + "</p></body></html>\n");
    }

    /**
     * Feeds every line of one log file through {@link #processLine(String)}.
     * The reader is closed even when reading fails part-way through.
     *
     * @param fileName path of the log file to read
     * @throws IOException if the file cannot be opened or read
     */
    private void processFileName(String fileName) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(fileName));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                processLine(line);
            }
        } finally {
            br.close();
        }
    }

    /**
     * Classifies one log line. A line is counted as unparseable when any
     * expected field is missing or truncated, as non-200 when the status is
     * not "200", and is otherwise analysed by {@link #processReq}.
     *
     * Lines rejected before the date is examined are attributed to whichever
     * day is current at that point.
     *
     * @param line one raw line of the log
     */
    private void processLine(String line) {
        totalRequests++;

        int dateStart = line.indexOf('[');
        if (dateStart < 0) {
            unparseableLines++;
            return;
        }

        int openQuote = line.indexOf('"', dateStart); // quote at the start of the request
        if (openQuote < 0) {
            unparseableLines++;
            return;
        }

        int urlStart = line.indexOf(' ', openQuote + 1); // space that ends the method
        if (urlStart < 0) {
            unparseableLines++;
            return;
        }

        int urlSpace = line.indexOf(' ', urlStart + 1);   // space between url and HTTP version
        int closeQuote = line.indexOf('"', urlStart + 1); // end of the quoted request
        if (closeQuote < 0) {
            unparseableLines++;
            return;
        }

        // The url normally ends at the space before the HTTP version; with no
        // such space inside the quotes (HTTP/0.9 style) it ends at the quote.
        int urlEnd = (urlSpace >= 0 && urlSpace < closeQuote) ? urlSpace : closeQuote;

        // Layout after the closing quote is: <space>SSS<space>LENGTH
        int statusStart = closeQuote + 2;
        if (statusStart + 3 > line.length()) {
            unparseableLines++; // truncated before a full status code
            return;
        }
        if (!line.startsWith("200", statusStart)) {
            non200Lines++;
            return;
        }

        int lengthStart = statusStart + 4;
        if (lengthStart >= line.length()) {
            unparseableLines++; // truncated before the content length
            return;
        }
        int lengthEnd = line.indexOf(' ', lengthStart);
        String byteStr = (lengthEnd < 0)
            ? line.substring(lengthStart)
            : line.substring(lengthStart, lengthEnd);

        long bytes;
        try {
            bytes = Long.parseLong(byteStr);
        } catch (NumberFormatException ignored) {
            // Not a number (e.g. "-"): the line cannot be analysed.
            unparseableLines++;
            return;
        }
        if (bytes < 0) {
            unparseableLines++; // a negative length is not a usable value
            return;
        }

        int dateEnd = dateStart + 1 + DATE_LENGTH;
        if (dateEnd > line.length()) {
            unparseableLines++;
            return;
        }
        String date = line.substring(dateStart + 1, dateEnd);
        if (!date.equals(lastDate)) {
            // This line belongs to the new day: take it out of the old day's
            // total before that day is reported, then count it afresh.
            totalRequests--;
            flipDate(date);
            totalRequests++;
        }

        analysedRequests++;
        processReq(line.substring(urlStart + 1, urlEnd), byteStr, bytes);
    }

    /**
     * Switches accumulation to a new day, first reporting and clearing the
     * previous day's counters if there was a previous day.
     *
     * @param newLineDate day key of the line that triggered the change
     */
    private void flipDate(String newLineDate) {
        if (!lastDate.equals(NO_DATE)) {
            reportDailyStats();
            clearDailyCounters();
        }
        lastDate = newLineDate;
    }

    /** Resets every per-day counter to zero; the URL history is kept. */
    private void clearDailyCounters() {
        unparseableLines = 0;
        non200Lines = 0;
        totalRequests = 0;
        analysedRequests = 0;
        newCount = 0;
        modCount = 0;
        sameCount = 0;
        newBytes = 0;
        modBytes = 0;
        sameBytes = 0;
    }

    /**
     * Counts one analysable response as new, updated or same.
     *
     * @param url     requested URL, used as the identity of the resource
     * @param byteStr content length exactly as logged; lengths are compared
     *                as strings, so "0100" and "100" count as different
     * @param bytes   numeric value of {@code byteStr}, added to the volume
     */
    private void processReq(String url, String byteStr, long bytes) {
        Set seenLengths = (Set) urls.get(url);
        if (seenLengths == null) {
            seenLengths = new HashSet(5);
            seenLengths.add(byteStr);
            urls.put(url, seenLengths);
            newCount++;
            newBytes += bytes;
        } else if (seenLengths.add(byteStr)) {
            // add() returns true only for a length not logged before for this url
            modCount++;
            modBytes += bytes;
        } else {
            sameCount++;
            sameBytes += bytes;
        }
    }

    /**
     * Writes one table row for the day held in {@link #lastDate}. Count
     * percentages are relative to all requests of the day (including
     * unparseable and non-200 lines); volume percentages are relative to the
     * analysed bytes. The twelve category cells are omitted when there is
     * nothing to divide by.
     */
    private void reportDailyStats() {
        long totalBytes = newBytes + modBytes + sameBytes;

        System.out.println("<tr><td>" + escapeHtml(lastDate) + "</td>\n");
        System.out.println("<td>" + totalRequests
            + "</td><td>" + unparseableLines
            + "</td><td>" + non200Lines
            + "</td><td>" + analysedRequests
            + "</td><td>" + megabytes(totalBytes) + "</td>\n");

        if (totalRequests > 0 && totalBytes > 0) {
            System.out.println(categoryCells(sameCount, sameBytes, totalBytes));
            System.out.println(categoryCells(newCount, newBytes, totalBytes));
            System.out.println(categoryCells(modCount, modBytes, totalBytes));
        }
        System.out.println("</tr>");
    }

    /** Builds the four cells (count, count %, MB, volume %) for one category. */
    private String categoryCells(int count, long bytes, long totalBytes) {
        return "<td>" + count
            + "</td><td>" + percent(count, totalRequests)
            + "</td><td>" + megabytes(bytes)
            + "</td><td>" + percent(bytes, totalBytes) + "</td>\n";
    }

    /** Returns {@code part} as a percentage of {@code whole}, rounded to nearest. */
    private static long percent(long part, long whole) {
        // Divide in floating point: integer division would truncate before
        // rounding, and part * 100 could overflow for large counts.
        return Math.round(part * 100.0 / whole);
    }

    /** Converts a byte count to decimal megabytes, rounded to nearest. */
    private static long megabytes(long bytes) {
        return (bytes + 500000) / 1000000;
    }

    /** Escapes the characters that are significant in HTML text. */
    private static String escapeHtml(String text) {
        StringBuffer out = new StringBuffer(text.length() + 8);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '&':
                    out.append("&amp;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }

    /**
     * Command-line entry point.
     *
     * @param args one or more web server log files, in ascending date order
     * @throws IOException if a log file cannot be opened or read
     */
    public static void main(String args[]) throws IOException {
        if (args.length < 1) {
            System.out.println("Supply 1 or more web server logs as parameters");
        } else {
            new LogSummary(args);
        }
    }
}