Commit d83e41cd authored by Francois Marier's avatar Francois Marier

Commit our version of gitdm for release stats

This comes from git://git.lwn.net/gitdm.git
parent f3b00ce0
The code in this directory can be distributed under the terms of the GNU
General Public License, version 2.
#
# Stuff for dealing with configuration files.
#
#
# This code is part of the LWN git data miner.
#
# Copyright 2007-11 Eklektix, Inc.
# Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
#
import sys, re, datetime, os.path
import database
#
# Read a line and strip out junk.
#
def ReadConfigLine(file):
    """Return the next meaningful line from an open configuration file.

    Everything from the first '#' to the end of a line is treated as a
    comment and dropped, then surrounding white space is stripped.  Lines
    that end up empty are skipped.

    file -- an object with a readline() method.

    Returns the cleaned-up line, or None once end of file is reached.
    """
    # Loop instead of recursing: a long run of blank or comment-only
    # lines would otherwise exhaust the interpreter's recursion limit.
    while True:
        line = file.readline()
        if not line:
            return None
        line = line.split('#')[0].strip()
        if line:
            return line
#
# Give up and die.
#
def croak(message):
    """Write message plus a newline to stderr, then exit with status 1."""
    text = message + '\n'
    sys.stderr.write(text)
    raise SystemExit(1)
#
# Read a list of email aliases.
#
def ReadEmailAliases(name):
    """Load an email alias file into the database.

    Each meaningful line holds an alias -- either a bare word or a
    double-quoted string -- followed by white space and the canonical
    address.  Every pair is handed to database.AddEmailAlias() with the
    double quotes removed from the alias.

    name -- path of the alias file.

    Exits through croak() if the file cannot be opened, if a line does
    not have two fields, or if the canonical field has no '@' after its
    first character.
    """
    try:
        file = open(name, 'r')
    except IOError:
        croak('Unable to open email alias file %s' % (name))
    # Raw string, so the \S and \s escapes reach the re module untouched.
    pattern = re.compile(r'^("[^"]+"|\S+)\s+(.+)$')
    # The with block closes the file even when we bail out early.
    with file:
        line = ReadConfigLine(file)
        while line:
            m = pattern.match(line)
            if not m:
                croak('Funky email alias line "%s"' % (line))
            # find() <= 0 rejects both "no '@'" and "'@' as first character".
            if m.group(2).find('@') <= 0:
                croak('Non-addresses in email alias "%s"' % (line))
            database.AddEmailAlias(m.group(1).replace('"', ''), m.group(2))
            line = ReadConfigLine(file)
#
# The Email/Employer map
#
# Group 1: the address (or domain) token; group 2: the employer name, which
# runs up to an optional '<'; group 4: the optional yyyy-mm-dd end date.
EMMpat = re.compile(r'^([^\s]+)\s+([^<]+)\s*(<\s*(\d+-\d+-\d+)\s*)?$')


def ReadEmailEmployers(name):
    """Load an email/employer map file into the database.

    Every meaningful line is matched against EMMpat and passed to
    database.AddEmailEmployerMapping() as (email, employer, end date);
    the end date is None when the line does not carry one.

    name -- path of the map file.

    Exits through croak() if the file cannot be opened, if a line does
    not match EMMpat, or if its date is not a valid calendar date.
    """
    try:
        file = open(name, 'r')
    except IOError:
        croak('Unable to open email/employer file %s' % (name))
    # The with block closes the file even when we bail out early.
    with file:
        line = ReadConfigLine(file)
        while line:
            m = EMMpat.match(line)
            if not m:
                croak('Funky email/employer line "%s"' % (line))
            email = m.group(1)
            company = m.group(2).strip()
            # The pattern only checks the digit layout; something like
            # 2011-02-30 still makes datetime.date raise ValueError.
            try:
                enddate = ParseDate(m.group(4))
            except ValueError:
                croak('Bad date in email/employer line "%s"' % (line))
            database.AddEmailEmployerMapping(email, company, enddate)
            line = ReadConfigLine(file)
def ParseDate(cdate):
    """Convert a "yyyy-mm-dd" string to a datetime.date.

    A false value (None or an empty string) yields None.  Only the first
    three dash-separated fields are used.
    """
    if cdate:
        fields = cdate.split('-')
        year = int(fields[0])
        month = int(fields[1])
        day = int(fields[2])
        return datetime.date(year, month, day)
    return None
def ReadGroupMap(fname, employer):
    """Map every address listed in a group map file to one employer.

    fname    -- path of the group map file; each meaningful line is
                passed as-is to database.AddEmailEmployerMapping().
    employer -- employer name recorded for all of those lines.

    Exits through croak() if the file cannot be opened.
    """
    try:
        mapfile = open(fname, 'r')
    except IOError:
        croak('Unable to open group map file %s' % (fname))
    while True:
        entry = ReadConfigLine(mapfile)
        if not entry:
            break
        database.AddEmailEmployerMapping(entry, employer)
    mapfile.close()
#
# Read in a virtual employer description.
#
def ReadVirtual(file, name):
    """Read the body of a VirtualEmployer section from the config file.

    Lines of the form "nn% employer" (the '%' is optional) are consumed
    from the already-open config file until a line whose first word is
    "end"; each one is recorded on a new database.VirtualEmployer as a
    split of nn/100, and the employer is stored when "end" is seen.

    file -- the open main configuration file, positioned just after the
            "VirtualEmployer name" line.
    name -- name of the virtual employer being defined.

    Exits through croak() on a non-integer percentage, a percentage
    outside 1..100, a split line with no employer name, or when the file
    ends before an "end" line is seen.
    """
    ve = database.VirtualEmployer(name)
    line = ReadConfigLine(file)
    while line:
        sl = line.split(None, 1)
        first = sl[0]
        if first == 'end':
            ve.store()
            return
        #
        # Zap the "%" syntactic sugar if it's there
        #
        if first[-1] == '%':
            first = first[:-1]
        try:
            percent = int(first)
        except ValueError:
            croak('Bad split value "%s" for virtual empl %s' % (first, name))
        if not (0 < percent <= 100):
            croak('Bad split value "%s" for virtual empl %s' % (first, name))
        # Without this check a bare percentage would register a split
        # against an employer whose name is the empty string.
        if len(sl) < 2:
            croak('Missing employer name in split "%s" for virtual empl %s'
                  % (line, name))
        ve.addsplit(sl[1], percent/100.0)
        line = ReadConfigLine(file)
    #
    # Falling out of the loop means the file ended with no "end" line.
    #
    croak('Missing "end" line for virtual employer %s' % (name))
#
# Read an overall config file.
#
def ConfigFile(name, confdir):
    """Read the main gitdm configuration file and everything it refers to.

    Each meaningful line starts with a directive:

      EmailAliases file       -- handled by ReadEmailAliases()
      EmailMap file           -- handled by ReadEmailEmployers()
      GroupMap file employer  -- handled by ReadGroupMap()
      VirtualEmployer name    -- the following lines, up to "end", are
                                 consumed by ReadVirtual()

    name    -- path of the main configuration file.
    confdir -- directory joined in front of the file names given by the
               EmailAliases, EmailMap and GroupMap directives.

    Exits through croak() if the file cannot be opened, on a line with
    fewer than two fields, on a GroupMap line without exactly three
    fields, or on an unknown directive.
    """
    try:
        file = open(name, 'r')
    except IOError:
        croak('Unable to open config file %s' % (name))
    # The original never closed this handle; the with block does so on
    # every exit path, as the other readers already did.
    with file:
        line = ReadConfigLine(file)
        while line:
            # At most three fields, so an employer name keeps its spaces.
            sline = line.split(None, 2)
            if len(sline) < 2:
                croak('Funky config line: "%s"' % (line))
            if sline[0] == 'EmailAliases':
                ReadEmailAliases(os.path.join(confdir, sline[1]))
            elif sline[0] == 'EmailMap':
                ReadEmailEmployers(os.path.join(confdir, sline[1]))
            elif sline[0] == 'GroupMap':
                if len(sline) != 3:
                    croak('Funky group map line "%s"' % (line))
                ReadGroupMap(os.path.join(confdir, sline[1]), sline[2])
            elif sline[0] == 'VirtualEmployer':
                # ReadVirtual keeps reading from this same file object.
                ReadVirtual(file, ' '.join(sline[1:]))
            else:
                croak('Unrecognized config line: "%s"' % (line))
            line = ReadConfigLine(file)
The code in this directory makes up the "git data miner," a simple hack
which attempts to figure things out from the revision history in a git
repository.
INSTALLING GITDM
gitdm is a python script and doesn't need to be properly installed like other
normal programs. You just have to adjust your PATH variable, pointing it to
the directory of gitdm, or alternatively create a symbolic link to the script
inside /usr/bin.
Before actually running gitdm you may also want to update the configuration
file (gitdm.config) with the needed information.
RUNNING GITDM
Run it like this:
git log -p -M [details] | gitdm [options]
The [details] tell git which changesets are of interest; the [options] can
be:
-a If a patch contains signoff lines from both Andrew Morton
and Linus Torvalds, omit Linus's.
-b dir Specify the base directory to fetch the configuration files.
-c file Specify the name of the gitdm configuration file.
By default, "./gitdm.config" is used.
-d Omit the developer reports, giving employer information
only.
-D Rather than create the usual statistics, create a
file (datelc) providing lines changed per day, where the first column
displays the changes that happened on that day only and the second column
sums that day's changes with those of all previous days. This option is
suitable for feeding to a tool like gnuplot.
-h file Generate HTML output to the given file
-l num Only list the top <num> entries in each report.
-o file Write text output to the given file (default is stdout).
-r pat Only generate statistics for changes to files whose
name matches the given regular expression.
-s Ignore Signed-off-by lines which match the author of
each patch.
-u Group all unknown developers under the "(Unknown)"
employer.
-x file Export raw statistics as CSV.
-w Aggregate the data by weeks instead of months in the
CSV file when -x is used.
-z Dump out the hacker database to "database.dump".
A typical command line used to generate the "who wrote 2.6.x" LWN articles
looks like:
git log -p -M v2.6.19..v2.6.20 | \
gitdm -u -s -a -o results -h results.html
CONFIGURATION FILE
The main purpose of the configuration file is to direct the mapping of
email addresses onto employers. Please note that the config file parser is
exceptionally stupid and unrobust at this point, but it gets the job done.
Blank lines and lines beginning with "#" are ignored. Everything else
specifies a file with some sort of mapping:
EmailAliases file
Developers often post code under a number of different email
addresses, but it can be desirable to group them all together in
the statistics. An EmailAliases file just contains a bunch of
lines of the form:
alias@address canonical@address
Any patches originating from alias@address will be treated as if
they had come from canonical@address.
It may happen that some people set their git user data in the
following form: "joe.hacker@acme.org <Joe Hacker>". The
"Joe Hacker" is then considered as the email... but gitdm says
it is a "Funky" email. An alias line in the following form can
be used to alias these commits to the correct email
address:
"Joe Hacker" joe.hacker@acme.org
EmailMap file
Map email addresses onto employers. These files contain lines
like:
[user@]domain employer [< yyyy-mm-dd]
If the "user@" portion is missing, all email from the given domain
will be treated as being associated with the given employer. If a
date is provided, the entry is only valid up to that date;
otherwise it is considered valid into the indefinite future. This
feature can be useful for properly tracking developers' work when
they change employers but do not change email addresses.
GroupMap file employer
This is a variant of EmailMap provided for convenience; it contains
email addresses only, all of which are associated with the given
employer.
VirtualEmployer name
nn% employer1
...
end
This construct (which appears in the main configuration file)
causes the creation of a fake employer with the given
"name". It directs that any contributions attributed to that
employer should be split to other (real) employers using the given
percentages. The functionality works, but is primitive - there is,
for example, no check to ensure that the percentages add up to
something rational.
OTHER TOOLS
A few other tools have been added to this repository:
treeplot
Reads a set of commits, then generates a graphviz file charting the
flow of patches into the mainline. Needs to be smarter, but, then,
so does everything else in this directory.
findoldfiles
Simple brute-force crawler which outputs the names of any files
which have not been touched since the original (kernel) commit.
committags
I needed to be able to quickly associate a given commit with the
major release which contains it. First attempt used
"git tags --contains="; after it ran for a solid week, I concluded
there must be a better way. This tool just reads through the repo,
remembering tags, and creating a Python dictionary containing the
association. The result is an ugly 10mb pickle file, but, even so,
it's still a better way.
linetags
Crawls through a directory hierarchy, counting how many lines of
code are associated with each major release. Needs the pickle file
from committags to get the job done.
NOTES AND CREDITS
Gitdm was written by Jonathan Corbet; many useful contributions have come
from Greg Kroah-Hartman.
Please note that this tool is provided in the hope that it will be useful,
but it is not put forward as an example of excellence in design or
implementation. Hacking on gitdm tends to stop the moment it performs
whatever task is required of it at the moment. Patches to make it less
hacky, less ugly, and more robust are welcome.
Jonathan Corbet
corbet@lwn.net
#!/usr/bin/python
#
# Generate a database of commits and major versions they went into.
#
# committags [git-args]
#
# This code is part of the LWN git data miner.
#
# Copyright 2007-11 Eklektix, Inc.
# Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
#
import sys
import re
import os
import pickle
# Build the git command; any command-line arguments are passed straight
# through to "git log".
git = 'git log --decorate '
if len(sys.argv) > 1:
    git += ' '.join(sys.argv[1:])
gitlog = os.popen(git, 'r')

# DB maps a commit id to the most recently seen release tag; commits seen
# before any tag get the string 'None'.
DB = { }
Tag = 'None'

tagline = re.compile(r'^commit ([\da-f]+) .*tag: (v2\.6\.\d\d)')
commit = re.compile(r'^commit ([\da-f]+)')

# Iterate over the pipe rather than calling readlines(), so the whole log
# is never held in memory at once.
for line in gitlog:
    if not line.startswith('commit'):
        continue # This makes it go faster
    m = tagline.search(line)
    if m:
        # A tagged commit: remember the tag for the commits that follow.
        DB[m.group(1)] = Tag = m.group(2)
    else:
        m = commit.search(line)
        if m:
            DB[m.group(1)] = Tag
gitlog.close()

print('Found %d commits' % (len(DB)))

# Save the association as a pickle for other tools to load.
out = open('committags.db', 'w')
try:
    pickle.dump(DB, out)
finally:
    out.close()
#
# aggregate per-month statistics for people
#
import sys, datetime
class CSVStat:
    """Added/removed line totals for one author in one reporting period."""

    def __init__(self, name, employer, date):
        # Identification of the row: who, for whom, and which period.
        self.name = name
        self.employer = employer
        self.date = date
        # Running totals, filled in by accumulate().
        self.added = 0
        self.removed = 0

    def accumulate(self, p):
        """Add the added/removed counts of patch p to the totals."""
        self.added += p.added
        self.removed += p.removed
# Maps "<author name>-<period>" to the CSVStat collecting that period.
PeriodCommitHash = { }


def AccumulatePatch(p, Aggregate):
    """Add patch p to the statistics of its author for its period.

    The period is the ISO year and week number of p.date when Aggregate
    is 'week'; otherwise it is the first day of p.date's month.  A new
    CSVStat is created the first time an author/period pair is seen,
    using the employer the author had for p.email on p.date.
    """
    if Aggregate == 'week':
        iso = p.date.isocalendar()
        date = "%.2d-%.2d" % (iso[0], iso[1])
    else:
        date = "%.2d-%.2d-01" % (p.date.year, p.date.month)
    key = "%s-%s" % (p.author.name, date)
    try:
        stat = PeriodCommitHash[key]
    except KeyError:
        employer = p.author.emailemployer(p.email, p.date)
        stat = CSVStat(p.author.name, employer, date)
        PeriodCommitHash[key] = stat
    stat.accumulate(p)
def OutputCSV(file):
    """Write the accumulated per-period statistics to file.

    Output is a header line followed by one tab-separated row for each
    entry in PeriodCommitHash.  Nothing is written when file is None.
    """
    if file is None:
        return

    def scrub(text):
        # sanitise names " is common and \" sometimes too
        return text.replace("\"", ".").replace("\\", ".")

    file.write("Name\tAffliation\tDate\tAdded\tRemoved\n")
    for stat in PeriodCommitHash.values():
        empl_name = scrub(stat.employer.name)
        author_name = scrub(stat.name)
        row = (author_name, empl_name, stat.date, stat.added, stat.removed)
        file.write("\"%s\"\t\"%s\"\t%s\t%d\t%d\n" % row)
#
# The "database".
#
# This code is part of the LWN git data miner.
#
# Copyright 2007-11 Eklektix, Inc.
# Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
#
import sys, datetime
class Hacker:
    """One developer: addresses, employers and everything credited to them.

    self.email and self.employer are parallel lists; self.employer[i] is
    the list of (end date, employer) pairs that goes with self.email[i].
    """

    def __init__(self, name, id, elist, email):
        self.name = name
        self.id = id
        self.email = [email]
        self.employer = [elist]
        self.added = 0
        self.removed = 0
        self.testcred = 0
        self.repcred = 0
        self.patches = []
        self.signoffs = []
        self.reviews = []
        self.tested = []
        self.reports = []

    def addemail(self, email, elist):
        """Record another address, with its employer list, for this hacker."""
        self.email.append(email)
        self.employer.append(elist)
        HackersByEmail[email] = self

    def emailemployer(self, email, date):
        """Return the employer tied to email on the given date.

        For every stored address equal to email, the first employer whose
        end date lies after date is returned.  If nothing matches, a
        diagnostic is printed and None is returned.
        """
        for pos, known in enumerate(self.email):
            if known != email:
                continue
            for edate, empl in self.employer[pos]:
                if edate > date:
                    return empl
        print('OOPS.  %s %s %s %s %s'
              % (self.name, self.employer, self.email, email, date))
        return None # Should not happen

    def addpatch(self, patch):
        """Credit patch, and its line counts, to this hacker."""
        self.patches.append(patch)
        self.added += patch.added
        self.removed += patch.removed

    #
    # There's got to be a better way.
    #
    def addsob(self, patch):
        """Remember patch as signed off by this hacker."""
        self.signoffs.append(patch)

    def addreview(self, patch):
        """Remember patch as reviewed by this hacker."""
        self.reviews.append(patch)

    def addtested(self, patch):
        """Remember patch as tested by this hacker."""
        self.tested.append(patch)

    def addreport(self, patch):
        """Remember patch as reported by this hacker."""
        self.reports.append(patch)

    def reportcredit(self, patch):
        """Bump the report-credit counter; patch itself is not used."""
        self.repcred += 1

    def testcredit(self, patch):
        """Bump the test-credit counter; patch itself is not used."""
        self.testcred += 1
# Three indexes over the same Hacker objects.
HackersByName = { }
HackersByEmail = { }
HackersByID = { }
# Next numeric id to hand out.
MaxID = 0


def StoreHacker(name, elist, email):
    """Create a Hacker, register it in all three indexes, and return it.

    The new hacker gets the current value of MaxID as its id, and MaxID
    is then advanced by one.
    """
    global MaxID
    newid = MaxID
    MaxID = newid + 1
    hacker = Hacker(name, newid, elist, email)
    HackersByID[newid] = hacker
    HackersByEmail[email] = hacker
    HackersByName[name] = hacker
    return hacker
def LookupEmail(addr):
    """Return the Hacker registered under addr, or None if there is none."""
    return HackersByEmail.get(addr)
def LookupName(name):
    """Return the Hacker registered under name, or None if there is none."""
    return HackersByName.get(name)
def LookupID(id):
    """Return the Hacker registered under numeric id, or None if absent."""
    return HackersByID.get(id)
def AllHackers():
    """Return all Hacker objects held in HackersByID."""
    # A filtered variant was left disabled in the original:
    # return [h for h in HackersByID.values ()] # if (h.added + h.removed) > 0]
    everyone = HackersByID.values()
    return everyone
def DumpDB():
    """Dump the hacker database to the file "database.dump".

    Hackers are written in name order.  Each gets a summary line (id,
    name, patch count, lines added/removed, signoff count), followed by
    one line per email address and, under each address, one line per
    (end date, employer) pair.
    """
    # The original never closed the file; the with block guarantees the
    # dump is flushed and closed even if a write fails part way.
    with open('database.dump', 'w') as out:
        for name in sorted(HackersByName):
            h = HackersByName[name]
            out.write('%4d %s %d p (+%d -%d) sob: %d\n' % (h.id, h.name,
                                                           len(h.patches),
                                                           h.added, h.removed,
                                                           len(h.signoffs)))
            # h.email and h.employer are parallel lists.
            for email, elist in zip(h.email, h.employer):
                out.write('\t%s -> \n' % (email))
                for date, empl in elist:
                    out.write('\t\t %d-%d-%d %s\n' % (date.year, date.month,
                                                      date.day, empl.name))
#
# Employer info.
#
class Employer:
    """Running totals for everything attributed to one employer."""

    def __init__(self, name):
        self.name = name
        self.added = 0
        self.removed = 0
        self.count = 0
        self.changed = 0
        self.sobs = 0
        self.hackers = []

    def AddCSet(self, patch):
        """Fold one changeset into this employer's totals.

        "changed" grows by the larger of the patch's added and removed
        line counts, and the patch author is remembered once.
        """
        author = patch.author
        if author not in self.hackers:
            self.hackers.append(author)
        self.count = self.count + 1
        self.added = self.added + patch.added
        self.removed = self.removed + patch.removed
        self.changed = self.changed + max(patch.added, patch.removed)

    def AddSOB(self):
        """Count one more signoff for this employer."""
        self.sobs = self.sobs + 1
# Every known employer, keyed by name.
Employers = { }


def GetEmployer(name):
    """Return the Employer called name, creating and storing it if new."""
    if name not in Employers:
        Employers[name] = Employer(name)
    return Employers[name]
def AllEmployers():
    """Return all the objects stored in the Employers table."""
    everyone = Employers.values()
    return everyone
#
# Certain obnoxious developers, who will remain nameless (because we
# would never want to run afoul of Thomas) want their work split among
# multiple companies. Let's try to cope with that. Let's also hope
# this doesn't spread.
#
class VirtualEmployer(Employer):
    """A fake employer whose credits are shared out among real ones.

    self.splits is a list of (employer name, fraction) pairs, added with
    addsplit() and consumed by applysplits().
    """

    def __init__(self, name):
        Employer.__init__(self, name)
        self.splits = []

    def addsplit(self, name, fraction):
        """Direct `fraction` of our credits to the employer called name."""
        self.splits.append((name, fraction))

    #
    # Go through and (destructively) apply our credits to the
    # real employer.  Only one level of weirdness is supported.
    #
    def applysplits(self):
        """Hand our totals to the split targets, then reset ourselves.

        Each target gets int(total * fraction) of the added, removed,
        changed and count totals.  Afterwards this object is
        re-initialised, which zeroes the counts and empties the split
        list, so a second call has nothing left to apply.
        """
        for name, fraction in self.splits:
            real = GetEmployer(name)
            real.added += int(self.added*fraction)
            real.removed += int(self.removed*fraction)
            real.changed += int(self.changed*fraction)
            real.count += int(self.count*fraction)
        # Reset counts just in case.  This must use our own name: the
        # loop variable would rename us to the last split target, and it
        # is not even defined when there are no splits.
        self.__init__(self.name)

    def store(self):
        """Register this employer in the Employers table under its name.

        A warning is written to stderr if that name is already taken
        (the old entry is printed and then replaced) or if no splits
        have been added.
        """
        if self.name in Employers:
            print(Employers[self.name])
            sys.stderr.write('WARNING: Virtual empl %s overwrites another\n'
                             % (self.name))
        if len(self.splits) == 0:
            sys.stderr.write('WARNING: Virtual empl %s has no splits\n'
                             % (self.name))
        # Should check that they add up too, but I'm lazy
        Employers[self.name] = self
#
# Mix all the virtual employers into their real destinations.
#
def MixVirtuals():
    """Call applysplits() on every VirtualEmployer among AllEmployers()."""
    virtuals = [e for e in AllEmployers() if isinstance(e, VirtualEmployer)]
    for virtual in virtuals:
        virtual.applysplits()
#
# The email map.
#
# Maps a lower-cased alias to the canonical address to use in its place.
EmailAliases = { }


def AddEmailAlias(variant, canonical):
    """Record that `variant` should be treated as `canonical`.

    The alias is stored lower-cased because RemapEmail() lower-cases an
    address before looking it up; stored verbatim, an alias containing
    upper-case letters (such as "Joe Hacker") could never match.

    A warning is written to stderr when the alias already exists; the
    new canonical address then replaces the old one.
    """
    key = variant.lower()
    if key in EmailAliases:
        sys.stderr.write('Duplicate email alias for %s\n' % (variant))
    EmailAliases[key] = canonical
def RemapEmail(email):
    """Lower-case email and return its alias target, if it has one.

    When the lower-cased address is not a key of EmailAliases, the
    lower-cased address itself is returned.
    """
    lowered = email.lower()
    return EmailAliases.get(lowered, lowered)
#
# Email-to-employer mapping.
#
# Maps a lower-cased address (or domain) to a list of (end date, employer)
# pairs; new pairs are inserted in front of the first later end date.
EmailToEmployer = { }
# Stand-in end date for mappings that have no explicit end.
nextyear = datetime.date.today() + datetime.timedelta(days = 365)


def AddEmailEmployerMapping(email, employer, end = nextyear):
    """Associate email with the named employer up to the date `end`.

    email    -- address or domain; it is lower-cased before use.
    employer -- employer name, resolved through GetEmployer().
    end      -- end date of the association; None means nextyear.

    A warning is printed when the address already has an entry with the
    same end date; the new entry is added all the same.
    """
    if end is None:
        end = nextyear
    email = email.lower()
    empl = GetEmployer(employer)
    if email not in EmailToEmployer:
        EmailToEmployer[email] = [(end, empl)]
        return
    entries = EmailToEmployer[email]
    for pos, (date, xempl) in enumerate(entries):
        if date == end: # probably both nextyear
            print('WARNING: duplicate email/empl for %s' % (email))
        if date > end:
            entries.insert(pos, (end, empl))
            return
    entries.append((end, empl))
def MapToEmployer(email, unknown = 0):
    """Return the (end date, employer) list that applies to email.

    Lookup order: the full lower-cased address, then the domain suffixes
    starting with the last two labels and growing towards the complete
    domain.  An address without '@' maps to the 'Funky' employer.  When
    nothing matches, the result is the '(Unknown)' employer if `unknown`
    is true, otherwise an employer named after the address itself.
    """
    # Somebody sometimes does s/@/ at /; let's fix it.
    email = email.lower().replace(' at ', '@')
    if email in EmailToEmployer:
        return EmailToEmployer[email]
    namedom = email.split('@')
    if len(namedom) < 2:
        print('Oops...funky email %s' % email)
        return [(nextyear, GetEmployer('Funky'))]
    labels = namedom[1].split('.')
    for start in range(len(labels) - 2, -1, -1):
        suffix = '.'.join(labels[start:])
        if suffix in EmailToEmployer:
            return EmailToEmployer[suffix]
    fallback = '(Unknown)' if unknown else email
    return [(nextyear, GetEmployer(fallback))]
def LookupEmployer(email, mapunknown = 0):
    """Return MapToEmployer(email, mapunknown) unchanged."""
    return MapToEmployer(email, mapunknown)
#!/usr/bin/python
#
# Another quick hack of a script to find files unchanged