#!/usr/bin/env python

"""
Syntax: makestats.py traffic.csv monthly.csv [input html files]

Makes statistics from LiveJournal's monthly calendar view pages.
At least one input html file must be given. The traffic.csv file is
overwritten on every run, whereas the monthly.csv file is always appended to,
even if it already has data for the specified months. The year and month are
extracted from the input file name, which should be of the form YYYY-MM.html.
"""

import csv
import re
from os.path import basename

# Markers that open and close the region of a page that main() scans.
starttag = "<dl>"
endtag = "</dl>"

# A line containing this marker starts a new day; the day is read from the
# bold text on that line (main() drops its last two characters before
# converting it to an int).
daytag = "<dt>"
dayre = re.compile(r"<b>(.*)</b>")

# Lines containing this substring are matched against postcommentre.
commenttag = " comment"

# Patterns for a single post line. Both capture time, user, link and title;
# postcommentre additionally captures the number in a trailing
# "- N comment..." suffix. Raw strings keep the "\." escape literal without
# relying on Python passing unknown escapes through unchanged.
postcommentre = re.compile(r'(?P<time>[0-9]{2}:[0-9]{2} [apAP][mM]):.*?<b>(?P<user>.*?)</b>.*?<a href="(?P<link>http://.*?\.html)".*?>(?P<title>.*?)</a> - (?P<comments>[0-9]*) comment')
postnocommentre = re.compile(r'(?P<time>[0-9]{2}:[0-9]{2} [apAP][mM]):.*?<b>(?P<user>.*?)</b>.*?<a href="(?P<link>http://.*?\.html)".*?>(?P<title>.*?)</a>')

# Usage text printed by main(); taken from the module docstring.
syntaxdoc = __doc__

# Month abbreviations indexed by month number; index 0 is an unused filler so
# that 1 maps to 'Jan'.
monthindex = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
              'Sep', 'Oct', 'Nov', 'Dec']

def _open_csv(path, mode):
    """Open *path* for csv.writer in the mode the csv module expects.

    On Python 3 the file is opened as text with newline='' and on Python 2 as
    binary, so the writer's own line terminator is written untranslated.
    *mode* is 'w' or 'a'.
    """
    import sys
    if sys.version_info[0] >= 3:
        return open(path, mode, newline='')
    return open(path, mode + 'b')


def _match_post(line):
    """Return the regex match describing a post on *line*, or None."""
    postmatch = None
    if commenttag in line:
        postmatch = postcommentre.search(line)
    if postmatch is None:
        # Either the line has no comment count, or " comment" only occurred
        # somewhere else on it (e.g. in the title); retry without the count
        # so such posts are not silently dropped.
        postmatch = postnocommentre.search(line)
    return postmatch


def _analyse_month(lines, year, month, traffic):
    """Write one row per post found in *lines* to the *traffic* csv writer.

    Only lines between a starttag line and the next endtag line are looked
    at, and posts are only recorded once a day line has been seen.
    Returns a (number of posts, number of comments) tuple.
    """
    inarea = False
    day = None
    numposts = 0
    numcomments = 0
    for line in lines:
        line = line.strip()
        if not inarea and starttag in line:
            inarea = True
        elif inarea and endtag in line:
            inarea = False
        if not inarea:
            continue
        if daytag in line:
            # Drop the last two characters of the bold text before converting.
            day = int(dayre.search(line).group(1)[:-2])
            continue
        if day is None:
            continue
        postmatch = _match_post(line)
        if postmatch is None:
            continue
        fields = postmatch.groupdict()
        # No 'comments' group (or an empty capture) counts as zero comments.
        comments = fields.get('comments') or 0
        traffic.writerow([year, month, day, fields['time'], fields['user'],
                          fields['link'], fields['title'], comments])
        numposts += 1
        numcomments += int(comments)
    return numposts, numcomments


def main(argv):
    """Build the traffic and monthly CSV files from calendar pages.

    argv[1] is the traffic csv (truncated, then one row per post:
    year, month, day, time, user, link, title, comments), argv[2] is the
    monthly csv (appended to, one row per input file: year, month, posts,
    comments) and argv[3:] are the input html files. The year and month are
    taken from characters 0-3 and 5-6 of each input file's base name.

    A progress line per input file is written to standard output.

    Returns -1 after printing the usage text when fewer than one input file
    is given, otherwise None. Raises ValueError if an input file name does
    not start with digits in the expected positions.
    """
    # Imported here because the module only imports sys inside its
    # __main__ guard.
    import sys

    if len(argv) < 4:
        sys.stdout.write(syntaxdoc.replace("makestats.py", argv[0]) + "\n")
        return -1

    # Nested with-blocks make sure every file is closed, also on errors.
    with _open_csv(argv[1], 'w') as trafficfile:
        with _open_csv(argv[2], 'a') as monthlyfile:
            traffic = csv.writer(trafficfile)
            monthly = csv.writer(monthlyfile)
            for filename in argv[3:]:
                basefilename = basename(filename)
                year = int(basefilename[0:4])
                month = int(basefilename[5:7])
                sys.stdout.write("Analysing %s %04d: "
                                 % (monthindex[month], year))
                with open(filename, 'r') as htmlfile:
                    numposts, numcomments = _analyse_month(
                        htmlfile, year, month, traffic)
                sys.stdout.write("%d posts, %d comments\n"
                                 % (numposts, numcomments))
                monthly.writerow([year, month, numposts, numcomments])

if __name__ == '__main__':
    # Run as a script: hand the command line to main() and use its return
    # value as the process exit status.
    import sys
    exit_status = main(sys.argv)
    sys.exit(exit_status)