[UPDATE] Apache CLF web.log data mining script
[1/1] from: ryan::christiansen::intellisol::com at: 26-Mar-2001 12:31
Here is the latest version of the Apache CLF web.log data mining script
which parses the .log files and dumps the data into .csv files for use in
Excel, etc.
The latest version filters out hits from local IP addresses and from
localhost, as well as hits on files with .gif, .jpg, .swf, and .css
extensions. Of course, the script can easily be customized.
REBOL [
Title: "Apache CLF web.log data mining script"
Date: 26-March-2001
Name: 'Log-Parser
Version: 0.6.5
File: %log-parser.r
Home: http://www.dangeroustechnology.com
Author: "Ryan C. Christiansen"
Email: [norsepower--qwest--net]
Owner: "Ryan Christiansen"
Rights: "Copyright (C) Ryan Christiansen 2001"
Language: 'English
Charset: 'ANSI
Purpose: {
Parse Apache CLF web.log files and dump data into .csv files.
}
Comment: {
Script relies on the existence of %dns-library.r which it uses as a
dns-cache file. The first time you run the script, begin with a 0-byte
%dns-library.r file.
}
History: [
0.6.5 [26-March-2001 "Added full header to script. Added choice for
parsing local or remote log. Added filter-local-IPs function. Added
filter-file-extensions function, including filter for cobalt server error
files and robots.txt files" "Ryan"]
]
Example: {do %log-parser.r}
]
; Ask where the log lives and load it into LOG-FILE as a block of lines.
print "Do you wish to parse a (L)ocal or (R)emote file?"
; Trim the answer so stray spaces around the letter do not make it invalid.
choice: trim input
switch/default choice [
    "L" [
        log-file: read/lines %web.log
    ]
    "R" [
        ; Placeholder credentials and host: edit before use.
        ; The url must have the form ftp://user:pass@host/path
        log-file: read/lines ftp://username:password@domain.dom/logs/web.log
    ]
][
    ; Anything else: tell the user, pause so the message can be read, exit.
    print "Invalid choice -- session ended"
    wait 4
    quit
]
; Load the on-disk DNS cache: a whitespace-separated list of
; "IP domain IP domain ..." entries. A missing cache file is treated the
; same as an empty one, so the very first run does not abort.
retrieved-library: either exists? %dns-library.r [
    read %dns-library.r
][
    copy ""
]
; Split the text into a flat block of strings used by dns-lookup.
dns-library: parse retrieved-library none
assemble-date: func [
    "Convert the timestamp fields of one CLF web.log line to a REBOL date! value"
    log-line [string!] "One line in CLF web.log format"
    /local fields stamp
][
    ; Split the line into fields; items 4 and 5 hold the timestamp pieces.
    fields: parse log-line none
    ; Drop the first character of each piece (the leading "[" of item 4).
    ; NOTE(review): for item 5 this removes its FIRST character, not a
    ; trailing "]" -- kept as-is because MAKE DATE! is fed exactly this text.
    remove fields/4
    remove fields/5
    stamp: join fields/4 [" " fields/5]
    make date! stamp
]
assemble-time: func [
    "Convert the timestamp fields of one CLF web.log line to a REBOL time! value"
    log-line [string!] "One line in CLF web.log format"
    /local date-line date-string return-time time-text
][
    ; Split the line into fields; items 4 and 5 hold the timestamp pieces.
    date-line: parse log-line none
    ; Drop the first character of each piece (the leading "[" of item 4).
    ; NOTE(review): for item 5 this removes its FIRST character, not a
    ; trailing "]" -- left unchanged because MAKE TIME! is fed exactly this text.
    remove date-line/4
    remove date-line/5
    date-string: rejoin [date-line/4 " " date-line/5]
    ; Everything after the first ":" is the time portion. The captured text
    ; is held in a local word so nothing leaks into the global context.
    parse date-string [
        thru ":" copy time-text to end
        (return-time: make time! time-text)
    ]
    ; Stays none when the string contains no ":" at all.
    return-time
]
dns-lookup: func [
    "Return the domain name for an IP address, consulting and updating a cache"
    dns-cache "Flat block of IP strings, each followed by its domain name"
    IP [string!] "The IP address that needs to be converted to a domain"
    /local domain
][
    ; Cache miss: ask DNS, remember the answer (or a marker when there is none).
    if none? domain: select dns-cache IP [
        domain: any [
            read join dns:// IP
            "unresolved"
        ]
        repend dns-cache [IP domain]
    ]
    domain
]
filter-local-IPs: func [
    "Return false for hits from the local address range or localhost, true otherwise"
    IP-to-check [string!] "The IP address as a string parsed from the CLF log file entry"
    /local IP
][
    IP: make tuple! IP-to-check
    ; A hit is kept only when it matches neither exclusion below.
    not any [
        all [(216.115.108.1 <= IP) (IP <= 216.115.108.254)] ; local IPs
        IP = 127.0.0.1                                       ; localhost
    ]
]
filter-file-extensions: func [
    "Return false when the request text contains any unwanted fragment, true otherwise"
    file-to-filter [string!] "A string from the CLF web.log containing the requested file during the hit"
    /local fragment
][
    foreach fragment [
        ; image, flash and stylesheet hits
        ".gif" ".jpg" ".jpeg" ".bmp" ".swf" ".css"
        ; cobalt error files
        "question_warning" "lock_warning"
        ; requests for robots.txt files
        "robots.txt"
    ][
        ; The first fragment found anywhere in the text rejects the hit.
        if find/any file-to-filter fragment [return false]
    ]
    true
]
parse-log-line: func [
    "Parse one CLF web.log line and, unless the hit is filtered out, append it to a CSV file as one row"
    log-line [string!] "One line in CLF web.log format"
    file-to-save [file!] "The CSV file the row is appended to"
    /local current-line IP-address domain-address hit-date hit-time
        hit-file hit-bytes referring-page browser-type
][
    ; Items used below: 1 = client IP, 6 = request, 8 = bytes,
    ; 9 = referring page, 10 = browser type.
    current-line: parse log-line none
    IP-address: make string! current-line/1
    ; Hits from local addresses are dropped before any DNS work is done.
    if filter-local-IPs IP-address [
        ; DNS-LIBRARY is the script-wide cache block shared with dns-lookup.
        domain-address: dns-lookup dns-library IP-address
        hit-date: assemble-date log-line
        hit-time: assemble-time log-line
        hit-file: current-line/6
        ; Hits on filtered files (images, stylesheets, ...) are dropped too.
        if filter-file-extensions hit-file [
            hit-bytes: current-line/8
            referring-page: current-line/9
            browser-type: current-line/10
            ; Write to the file the caller asked for, not to a global name.
            write/append file-to-save rejoin [
                IP-address "," domain-address ","
                hit-date "," hit-time "," hit-file ","
                hit-bytes "," referring-page "," browser-type newline
            ]
        ]
    ]
]
; Column header shared by every CSV file. Kept on a single line so that no
; line break ends up inside the header row.
csv-header: {User IP Address, User Domain Address, Date Hit, Time Hit, File Hit, Bytes Transferred, Referring Page, Browser Type}

start-csv-file: func [
    "Create (or overwrite) a CSV file that contains only the header row"
    target [file!] "The CSV file to start"
][
    write target join csv-header newline
]

; Walk the log once, writing each hit to a CSV file named after its date.
; CHECKSUM-DATE is the date of the file currently being written; it is none
; until the first line with a readable date has been seen.
checksum-date: none
foreach log-line log-file [
    ; A line whose date cannot be read is skipped instead of ending the run.
    if not error? try [hit-date: assemble-date log-line] [
        if not-equal? hit-date checksum-date [
            csv-file-name: make file! (rejoin [hit-date ".csv"])
            ; The file for the first date in the log is appended to when it
            ; already exists; a file for any later date is always restarted.
            if any [checksum-date  not exists? csv-file-name] [
                start-csv-file csv-file-name
            ]
            checksum-date: hit-date
        ]
        ; Best effort: a line that fails to parse is skipped silently.
        error? try [parse-log-line log-line csv-file-name]
    ]
]
; Save the DNS cache back to disk: every entry followed by one space.
; The text is built in memory and written in a single operation, so the
; file is never left truncated or half-written between entries.
cache-text: make string! 1024
foreach library-entry dns-library [
    append cache-text library-entry
    append cache-text " "
]
write %dns-library.r cache-text