From d83e41cd98cc4878aa8f4844e13cb58a6ab19d3e Mon Sep 17 00:00:00 2001 From: Francois Marier Date: Wed, 15 Jun 2011 16:58:17 +1200 Subject: [PATCH] Commit our version of gitdm for release stats This comes from git://git.lwn.net/gitdm.git --- gitdm/.gitignore | 2 + gitdm/COPYING | 2 + gitdm/ConfigFile.py | 152 +++++++++++++++ gitdm/README | 178 +++++++++++++++++ gitdm/committags | 45 +++++ gitdm/csv.py | 40 ++++ gitdm/database.py | 265 ++++++++++++++++++++++++++ gitdm/findoldfiles | 35 ++++ gitdm/gitdm | 412 ++++++++++++++++++++++++++++++++++++++++ gitdm/gitdm.config | 22 +++ gitdm/linetags | 85 +++++++++ gitdm/mahara.aliases | 4 + gitdm/mahara.config | 22 +++ gitdm/mahara.domain-map | 13 ++ gitdm/patterns.py | 39 ++++ gitdm/reports.py | 343 +++++++++++++++++++++++++++++++++ gitdm/treeplot | 333 ++++++++++++++++++++++++++++++++ 17 files changed, 1992 insertions(+) create mode 100644 gitdm/.gitignore create mode 100644 gitdm/COPYING create mode 100644 gitdm/ConfigFile.py create mode 100644 gitdm/README create mode 100755 gitdm/committags create mode 100644 gitdm/csv.py create mode 100644 gitdm/database.py create mode 100755 gitdm/findoldfiles create mode 100755 gitdm/gitdm create mode 100644 gitdm/gitdm.config create mode 100755 gitdm/linetags create mode 100644 gitdm/mahara.aliases create mode 100644 gitdm/mahara.config create mode 100644 gitdm/mahara.domain-map create mode 100644 gitdm/patterns.py create mode 100644 gitdm/reports.py create mode 100755 gitdm/treeplot diff --git a/gitdm/.gitignore b/gitdm/.gitignore new file mode 100644 index 0000000..f3d74a9 --- /dev/null +++ b/gitdm/.gitignore @@ -0,0 +1,2 @@ +*.pyc +*~ diff --git a/gitdm/COPYING b/gitdm/COPYING new file mode 100644 index 0000000..fe3eb43 --- /dev/null +++ b/gitdm/COPYING @@ -0,0 +1,2 @@ +The code in this directory can be distributed under the terms of the GNU +General Public License, version 2. 
diff --git a/gitdm/ConfigFile.py b/gitdm/ConfigFile.py new file mode 100644 index 0000000..32a4aec --- /dev/null +++ b/gitdm/ConfigFile.py @@ -0,0 +1,152 @@ +# +# Stuff for dealing with configuration files. +# +# +# This code is part of the LWN git data miner. +# +# Copyright 2007-11 Eklektix, Inc. +# Copyright 2007-11 Jonathan Corbet +# +# This file may be distributed under the terms of the GNU General +# Public License, version 2. +# +import sys, re, datetime, os.path +import database + +# +# Read a line and strip out junk. +# +def ReadConfigLine (file): + line = file.readline () + if not line: + return None + line = line.split('#')[0] # Get rid of any comments + line = line.strip () # and extra white space + if len (line) == 0: # we got rid of everything + return ReadConfigLine (file) + return line + +# +# Give up and die. +# +def croak (message): + sys.stderr.write (message + '\n') + sys.exit (1) + +# +# Read a list of email aliases. +# +def ReadEmailAliases (name): + try: + file = open (name, 'r') + except IOError: + croak ('Unable to open email alias file %s' % (name)) + line = ReadConfigLine (file) + while line: + m = re.match ('^("[^"]+"|\S+)\s+(.+)$', line) + if not m or len (m.groups ()) != 2: + croak ('Funky email alias line "%s"' % (line)) + if m and m.group (2).find ('@') <= 0: + croak ('Non-addresses in email alias "%s"' % (line)) + database.AddEmailAlias (m.group (1).replace ('"', ''), m.group (2)) + line = ReadConfigLine (file) + file.close () + +# +# The Email/Employer map +# +EMMpat = re.compile (r'^([^\s]+)\s+([^<]+)\s*(<\s*(\d+-\d+-\d+)\s*)?$') + +def ReadEmailEmployers (name): + try: + file = open (name, 'r') + except IOError: + croak ('Unable to open email/employer file %s' % (name)) + line = ReadConfigLine (file) + while line: + m = EMMpat.match (line) + if not m: + croak ('Funky email/employer line "%s"' % (line)) + email = m.group (1) + company = m.group (2).strip () + enddate = ParseDate (m.group (4)) + database.AddEmailEmployerMapping 
(email, company, enddate) + line = ReadConfigLine (file) + file.close () + +def ParseDate (cdate): + if not cdate: + return None + sdate = cdate.split ('-') + return datetime.date (int (sdate[0]), int (sdate[1]), int (sdate[2])) + + +def ReadGroupMap (fname, employer): + try: + file = open (fname, 'r') + except IOError: + croak ('Unable to open group map file %s' % (fname)) + line = ReadConfigLine (file) + while line: + database.AddEmailEmployerMapping (line, employer) + line = ReadConfigLine (file) + file.close () + +# +# Read in a virtual employer description. +# +def ReadVirtual (file, name): + ve = database.VirtualEmployer (name) + line = ReadConfigLine (file) + while line: + sl = line.split (None, 1) + first = sl[0] + if first == 'end': + ve.store () + return + # + # Zap the "%" syntactic sugar if it's there + # + if first[-1] == '%': + first = first[:-1] + try: + percent = int (first) + except ValueError: + croak ('Bad split value "%s" for virtual empl %s' % (first, name)) + if not (0 < percent <= 100): + croak ('Bad split value "%s" for virtual empl %s' % (first, name)) + ve.addsplit (' '.join (sl[1:]), percent/100.0) + line = ReadConfigLine (file) + # + # We should never get here + # + croak ('Missing "end" line for virtual employer %s' % (name)) + +# +# Read an overall config file. 
+# + +def ConfigFile (name, confdir): + try: + file = open (name, 'r') + except IOError: + croak ('Unable to open config file %s' % (name)) + line = ReadConfigLine (file) + while line: + sline = line.split (None, 2) + if len (sline) < 2: + croak ('Funky config line: "%s"' % (line)) + if sline[0] == 'EmailAliases': + ReadEmailAliases (os.path.join (confdir, sline[1])) + elif sline[0] == 'EmailMap': + ReadEmailEmployers (os.path.join (confdir, sline[1])) + elif sline[0] == 'GroupMap': + if len (sline) != 3: + croak ('Funky group map line "%s"' % (line)) + ReadGroupMap (os.path.join (confdir, sline[1]), sline[2]) + elif sline[0] == 'VirtualEmployer': + ReadVirtual (file, ' '.join (sline[1:])) + else: + croak ('Unrecognized config line: "%s"' % (line)) + line = ReadConfigLine (file) + diff --git a/gitdm/README b/gitdm/README new file mode 100644 index 0000000..7226541 --- /dev/null +++ b/gitdm/README @@ -0,0 +1,178 @@ +The code in this directory makes up the "git data miner," a simple hack +which attempts to figure things out from the revision history in a git +repository. + + +INSTALLING GITDM + +gitdm is a python script and doesn't need to be properly installed like other +normal programs. You just have to adjust your PATH variable, pointing it to +the directory of gitdm or alternatively create a symbolic link of the script +inside /usr/bin. + +Before actually running gitdm you may also want to update the configuration file +(gitdm.config) with the needed information. + + +RUNNING GITDM + +Run it like this: + + git log -p -M [details] | gitdm [options] + +The [details] tell git which changesets are of interest; the [options] can +be: + + -a If a patch contains signoff lines from both Andrew Morton + and Linus Torvalds, omit Linus's. + + -b dir Specify the base directory to fetch the configuration files. + + -c file Specify the name of the gitdm configuration file. + By default, "./gitdm.config" is used. + + -d Omit the developer reports, giving employer information + only. 
+ + -D Rather than create the usual statistics, create a + file (datelc) providing lines changed per day, where the first column + displays the changes that happened only on that day and the second sums + the day it happened with the previous ones. This option is suitable + for feeding to a tool like gnuplot. + + -h file Generate HTML output to the given file + + -l num Only list the top entries in each report. + + -o file Write text output to the given file (default is stdout). + + -r pat Only generate statistics for changes to files whose + name matches the given regular expression. + + -s Ignore Signed-off-by lines which match the author of + each patch. + + -u Group all unknown developers under the "(Unknown)" + employer. + + -x file Export raw statistics as CSV. + + -w Aggregate the data by weeks instead of months in the + CSV file when -x is used. + + -z Dump out the hacker database to "database.dump". + +A typical command line used to generate the "who wrote 2.6.x" LWN articles +looks like: + + git log -p -M v2.6.19..v2.6.20 | \ + gitdm -u -s -a -o results -h results.html + + +CONFIGURATION FILE + +The main purpose of the configuration file is to direct the mapping of +email addresses onto employers. Please note that the config file parser is +exceptionally stupid and unrobust at this point, but it gets the job done. + +Blank lines and lines beginning with "#" are ignored. Everything else +specifies a file with some sort of mapping: + +EmailAliases file + + Developers often post code under a number of different email + addresses, but it can be desirable to group them all together in + the statistics. An EmailAliases file just contains a bunch of + lines of the form: + + alias@address canonical@address + + Any patches originating from alias@address will be treated as if + they had come from canonical@address. + + It may happen that some people set their git user data in the + following form: "joe.hacker@acme.org <Joe Hacker>". 
The + "Joe Hacker" is then considered as the email... but gitdm says + it is a "Funky" email. An alias line in the following form can + be used to alias these commits aliased to the correct email + address: + + "Joe Hacker" joe.hacker@acme.org + + +EmailMap file + + Map email addresses onto employers. These files contain lines + like: + + [user@]domain employer [< yyyy-mm-dd] + + If the "user@" portion is missing, all email from the given domain + will be treated as being associated with the given employer. If a + date is provided, the entry is only valid up to that date; + otherwise it is considered valid into the indefinite future. This + feature can be useful for properly tracking developers' work when + they change employers but do not change email addresses. + + +GroupMap file employer + + This is a variant of EmailMap provided for convenience; it contains + email addresses only, all of which are associated with the given + employer. + +VirtualEmployer name + nn% employer1 + ... +end + + This construct (which appears in the main configuration file) + allows causes the creation of a fake employer with the given + "name". It directs that any contributions attributed to that + employer should be split to other (real) employers using the given + percentages. The functionality works, but is primitive - there is, + for example, no check to ensure that the percentages add up to + something rational. + + +OTHER TOOLS + +A few other tools have been added to this repository: + + treeplot + Reads a set of commits, then generates a graphviz file charting the + flow of patches into the mainline. Needs to be smarter, but, then, + so does everything else in this directory. + + findoldfiles + Simple brute-force crawler which outputs the names of any files + which have not been touched since the original (kernel) commit. + + committags + I needed to be able to quickly associate a given commit with the + major release which contains it. 
First attempt used + "git tags --contains="; after it ran for a solid week, I concluded + there must be a better way. This tool just reads through the repo, + remembering tags, and creating a Python dictionary containing the + association. The result is an ugly 10mb pickle file, but, even so, + it's still a better way. + + linetags + Crawls through a directory hierarchy, counting how many lines of + code are associated with each major release. Needs the pickle file + from committags to get the job done. + + +NOTES AND CREDITS + +Gitdm was written by Jonathan Corbet; many useful contributions have come +from Greg Kroah-Hartman. + +Please note that this tool is provided in the hope that it will be useful, +but it is not put forward as an example of excellence in design or +implementation. Hacking on gitdm tends to stop the moment it performs +whatever task is required of it at the moment. Patches to make it less +hacky, less ugly, and more robust are welcome. + +Jonathan Corbet +corbet@lwn.net diff --git a/gitdm/committags b/gitdm/committags new file mode 100755 index 0000000..39a532d --- /dev/null +++ b/gitdm/committags @@ -0,0 +1,45 @@ +#!/usr/bin/python +# +# Generate a database of commits and major versions they went into. +# +# committags [git-args] +# +# This code is part of the LWN git data miner. +# +# Copyright 2007-11 Eklektix, Inc. +# Copyright 2007-11 Jonathan Corbet +# +# This file may be distributed under the terms of the GNU General +# Public License, version 2. 
+# +import sys +import re +import os +import pickle + +git = 'git log --decorate ' +if len(sys.argv) > 1: + git += ' '.join(sys.argv[1:]) +input = os.popen(git, 'r') + +DB = { } +Tag = 'None' + +tagline = re.compile(r'^commit ([\da-f]+) .*tag: (v2\.6\.\d\d)') +commit = re.compile(r'^commit ([\da-f]+)') + +for line in input.readlines(): + if not line.startswith('commit'): + continue # This makes it go faster + m = tagline.search(line) + if m: + DB[m.group(1)] = Tag = m.group(2) + else: + m = commit.search(line) + if m: + DB[m.group(1)] = Tag + +print 'Found %d commits' % (len(DB.keys())) +out = open('committags.db', 'w') +pickle.dump(DB, out) +out.close() diff --git a/gitdm/csv.py b/gitdm/csv.py new file mode 100644 index 0000000..cec1f06 --- /dev/null +++ b/gitdm/csv.py @@ -0,0 +1,40 @@ +# +# aggregate per-month statistics for people +# +import sys, datetime + +class CSVStat: + def __init__ (self, name, employer, date): + self.name = name + self.employer = employer + self.added = self.removed = 0 + self.date = date + def accumulate (self, p): + self.added = self.added + p.added + self.removed = self.removed + p.removed + +PeriodCommitHash = { } + +def AccumulatePatch (p, Aggregate): + date = "%.2d-%.2d-01"%(p.date.year, p.date.month) + if (Aggregate == 'week'): + date = "%.2d-%.2d"%(p.date.isocalendar()[0], p.date.isocalendar()[1]) + authdatekey = "%s-%s"%(p.author.name, date) + if authdatekey not in PeriodCommitHash: + empl = p.author.emailemployer (p.email, p.date) + stat = CSVStat (p.author.name, empl, date) + PeriodCommitHash[authdatekey] = stat + else: + stat = PeriodCommitHash[authdatekey] + stat.accumulate (p) + +def OutputCSV (file): + if file is None: + return + file.write ("Name\tAffliation\tDate\tAdded\tRemoved\n") + for date, stat in PeriodCommitHash.items(): + # sanitise names " is common and \" sometimes too + empl_name = stat.employer.name.replace ("\"", ".").replace ("\\", ".") + author_name = stat.name.replace ("\"", ".").replace ("\\", ".") + 
file.write ("\"%s\"\t\"%s\"\t%s\t%d\t%d\n"%(author_name, empl_name, stat.date, \ + stat.added, stat.removed)) diff --git a/gitdm/database.py b/gitdm/database.py new file mode 100644 index 0000000..b5d9382 --- /dev/null +++ b/gitdm/database.py @@ -0,0 +1,265 @@ +# +# The "database". +# +# This code is part of the LWN git data miner. +# +# Copyright 2007-11 Eklektix, Inc. +# Copyright 2007-11 Jonathan Corbet +# +# This file may be distributed under the terms of the GNU General +# Public License, version 2. +# +import sys, datetime + + +class Hacker: + def __init__ (self, name, id, elist, email): + self.name = name + self.id = id + self.employer = [ elist ] + self.email = [ email ] + self.added = self.removed = 0 + self.patches = [ ] + self.signoffs = [ ] + self.reviews = [ ] + self.tested = [ ] + self.reports = [ ] + self.testcred = self.repcred = 0 + + def addemail (self, email, elist): + self.email.append (email) + self.employer.append (elist) + HackersByEmail[email] = self + + def emailemployer (self, email, date): + for i in range (0, len (self.email)): + if self.email[i] == email: + for edate, empl in self.employer[i]: + if edate > date: + return empl + print 'OOPS. ', self.name, self.employer, self.email, email, date + return None # Should not happen + + def addpatch (self, patch): + self.added += patch.added + self.removed += patch.removed + self.patches.append (patch) + + # + # There's got to be a better way. 
+ # + def addsob (self, patch): + self.signoffs.append (patch) + def addreview (self, patch): + self.reviews.append (patch) + def addtested (self, patch): + self.tested.append (patch) + def addreport (self, patch): + self.reports.append (patch) + + def reportcredit (self, patch): + self.repcred += 1 + def testcredit (self, patch): + self.testcred += 1 + +HackersByName = { } +HackersByEmail = { } +HackersByID = { } +MaxID = 0 + +def StoreHacker (name, elist, email): + global MaxID + + id = MaxID + MaxID += 1 + h = Hacker (name, id, elist, email) + HackersByName[name] = h + HackersByEmail[email] = h + HackersByID[id] = h + return h + +def LookupEmail (addr): + try: + return HackersByEmail[addr] + except KeyError: + return None + +def LookupName (name): + try: + return HackersByName[name] + except KeyError: + return None + +def LookupID (id): + try: + return HackersByID[id] + except KeyError: + return None + +def AllHackers (): + return HackersByID.values () +# return [h for h in HackersByID.values ()] # if (h.added + h.removed) > 0] + +def DumpDB (): + out = open ('database.dump', 'w') + names = HackersByName.keys () + names.sort () + for name in names: + h = HackersByName[name] + out.write ('%4d %s %d p (+%d -%d) sob: %d\n' % (h.id, h.name, + len (h.patches), + h.added, h.removed, + len (h.signoffs))) + for i in range (0, len (h.email)): + out.write ('\t%s -> \n' % (h.email[i])) + for date, empl in h.employer[i]: + out.write ('\t\t %d-%d-%d %s\n' % (date.year, date.month, date.day, + empl.name)) + +# +# Employer info. 
+# +class Employer: + def __init__ (self, name): + self.name = name + self.added = self.removed = self.count = self.changed = 0 + self.sobs = 0 + self.hackers = [ ] + + def AddCSet (self, patch): + self.added += patch.added + self.removed += patch.removed + self.changed += max(patch.added, patch.removed) + self.count += 1 + if patch.author not in self.hackers: + self.hackers.append (patch.author) + + def AddSOB (self): + self.sobs += 1 + +Employers = { } + +def GetEmployer (name): + try: + return Employers[name] + except KeyError: + e = Employer (name) + Employers[name] = e + return e + +def AllEmployers (): + return Employers.values () + +# +# Certain obnoxious developers, who will remain nameless (because we +# would never want to run afoul of Thomas) want their work split among +# multiple companies. Let's try to cope with that. Let's also hope +# this doesn't spread. +# +class VirtualEmployer (Employer): + def __init__ (self, name): + Employer.__init__ (self, name) + self.splits = [ ] + + def addsplit (self, name, fraction): + self.splits.append ((name, fraction)) + + # + # Go through and (destructively) apply our credits to the + # real employer. Only one level of weirdness is supported. + # + def applysplits (self): + for name, fraction in self.splits: + real = GetEmployer (name) + real.added += int (self.added*fraction) + real.removed += int (self.removed*fraction) + real.changed += int (self.changed*fraction) + real.count += int (self.count*fraction) + self.__init__ (name) # Reset counts just in case + + def store (self): + if Employers.has_key (self.name): + print Employers[self.name] + sys.stderr.write ('WARNING: Virtual empl %s overwrites another\n' + % (self.name)) + if len (self.splits) == 0: + sys.stderr.write ('WARNING: Virtual empl %s has no splits\n' + % (self.name)) + # Should check that they add up too, but I'm lazy + Employers[self.name] = self + +# +# Mix all the virtual employers into their real destinations. 
+# +def MixVirtuals (): + for empl in AllEmployers (): + if isinstance (empl, VirtualEmployer): + empl.applysplits () + +# +# The email map. +# +EmailAliases = { } + +def AddEmailAlias (variant, canonical): + if EmailAliases.has_key (variant): + sys.stderr.write ('Duplicate email alias for %s\n' % (variant)) + EmailAliases[variant] = canonical + +def RemapEmail (email): + email = email.lower () + try: + return EmailAliases[email] + except KeyError: + return email + +# +# Email-to-employer mapping. +# +EmailToEmployer = { } +nextyear = datetime.date.today () + datetime.timedelta (days = 365) + +def AddEmailEmployerMapping (email, employer, end = nextyear): + if end is None: + end = nextyear + email = email.lower () + empl = GetEmployer (employer) + try: + l = EmailToEmployer[email] + for i in range (0, len(l)): + date, xempl = l[i] + if date == end: # probably both nextyear + print 'WARNING: duplicate email/empl for %s' % (email) + if date > end: + l.insert (i, (end, empl)) + return + l.append ((end, empl)) + except KeyError: + EmailToEmployer[email] = [(end, empl)] + +def MapToEmployer (email, unknown = 0): + # Somebody sometimes does s/@/ at /; let's fix it. 
+ email = email.lower ().replace (' at ', '@') + try: + return EmailToEmployer[email] + except KeyError: + pass + namedom = email.split ('@') + if len (namedom) < 2: + print 'Oops...funky email %s' % email + return [(nextyear, GetEmployer ('Funky'))] + s = namedom[1].split ('.') + for dots in range (len (s) - 2, -1, -1): + addr = '.'.join (s[dots:]) + try: + return EmailToEmployer[addr] + except KeyError: + pass + if unknown: + return [(nextyear, GetEmployer ('(Unknown)'))] + return [(nextyear, GetEmployer (email))] + + +def LookupEmployer (email, mapunknown = 0): + elist = MapToEmployer (email, mapunknown) + return elist # GetEmployer (ename) diff --git a/gitdm/findoldfiles b/gitdm/findoldfiles new file mode 100755 index 0000000..493d5d3 --- /dev/null +++ b/gitdm/findoldfiles @@ -0,0 +1,35 @@ +#!/usr/bin/python +# +# Another quick hack of a script to find files unchanged +# since a given commit. +# +# This code is part of the LWN git data miner. +# +# Copyright 2007-11 Eklektix, Inc. +# Copyright 2007-11 Jonathan Corbet +# +# This file may be distributed under the terms of the GNU General +# Public License, version 2. +# +import sys, os + +OriginalSin = '1da177e4c3f41524e886b7f1b8a0c1fc7321cac2' + +def CheckFile(file): + git = os.popen('git log --pretty=oneline -1 ' + file, 'r') + line = git.readline() + if line.startswith(OriginalSin): + print file + git.close() +# +# Here we just plow through all the files. +# +if len(sys.argv) != 2: + sys.stderr.write('Usage: findoldfiles directory\n') + sys.exit(1) + +os.chdir(sys.argv[1]) +files = os.popen('/usr/bin/find . -type f', 'r') +for file in files.readlines(): + if file.find('.git/') < 0: + CheckFile(file[:-1]) diff --git a/gitdm/gitdm b/gitdm/gitdm new file mode 100755 index 0000000..8133c22 --- /dev/null +++ b/gitdm/gitdm @@ -0,0 +1,412 @@ +#!/usr/bin/python +# + +# +# This code is part of the LWN git data miner. +# +# Copyright 2007-11 Eklektix, Inc. 
+# Copyright 2007-11 Jonathan Corbet +# +# This file may be distributed under the terms of the GNU General +# Public License, version 2. + + +import database, csv, ConfigFile, reports +import getopt, datetime +import os, re, sys, rfc822, string +from patterns import * + +Today = datetime.date.today() + +# +# Remember author names we have griped about. +# +GripedAuthorNames = [ ] + +# +# Control options. +# +MapUnknown = 0 +DevReports = 1 +DateStats = 0 +AuthorSOBs = 1 +FileFilter = None +CSVFile = None +AkpmOverLt = 0 +DumpDB = 0 +CFName = 'gitdm.config' +DirName = '' +Aggregate = 'month' + +# +# Options: +# +# -a Andrew Morton's signoffs shadow Linus's +# -b dir Specify the base directory to fetch the configuration files +# -c cfile Specify a configuration file +# -d Output individual developer stats +# -D Output date statistics +# -h hfile HTML output to hfile +# -l count Maximum length for output lists +# -o file File for text output +# -r pattern Restrict to files matching pattern +# -s Ignore author SOB lines +# -u Map unknown employers to '(Unknown)' +# -x file.csv Export raw statistics as CSV +# -w Aggregrate the raw statistics by weeks instead of months +# -z Dump out the hacker database at completion + +def ParseOpts (): + global MapUnknown, DevReports + global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB + global CFName, CSVFile, DirName, Aggregate + + opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:o:r:suwx:z') + for opt in opts: + if opt[0] == '-a': + AkpmOverLt = 1 + elif opt[0] == '-b': + DirName = opt[1] + elif opt[0] == '-c': + CFName = opt[1] + elif opt[0] == '-d': + DevReports = 0 + elif opt[0] == '-D': + DateStats = 1 + elif opt[0] == '-h': + reports.SetHTMLOutput (open (opt[1], 'w')) + elif opt[0] == '-l': + reports.SetMaxList (int (opt[1])) + elif opt[0] == '-o': + reports.SetOutput (open (opt[1], 'w')) + elif opt[0] == '-r': + print 'Filter on "%s"' % (opt[1]) + FileFilter = re.compile (opt[1]) + elif opt[0] == '-s': + 
AuthorSOBs = 0 + elif opt[0] == '-u': + MapUnknown = 1 + elif opt[0] == '-x': + CSVFile = open (opt[1], 'w') + print "open output file " + opt[1] + "\n" + elif opt [0] == '-w': + Aggregate = 'week' + elif opt[0] == '-z': + DumpDB = 1 + + + +def LookupStoreHacker (name, email): + email = database.RemapEmail (email) + h = database.LookupEmail (email) + if h: # already there + return h + elist = database.LookupEmployer (email, MapUnknown) + h = database.LookupName (name) + if h: # new email + h.addemail (email, elist) + return h + return database.StoreHacker(name, elist, email) + +# +# Date tracking. +# + +DateMap = { } + +def AddDateLines(date, lines): + if lines > 1000000: + print 'Skip big patch (%d)' % lines + return + try: + DateMap[date] += lines + except KeyError: + DateMap[date] = lines + +def PrintDateStats(): + dates = DateMap.keys () + dates.sort () + total = 0 + datef = open ('datelc.csv', 'w') + datef.write('Date,Changed,Total Changed\n') + for date in dates: + total += DateMap[date] + datef.write ('%d/%02d/%02d,%d,%d\n' % (date.year, date.month, date.day, + DateMap[date], total)) + + +# +# Let's slowly try to move some smarts into this class. +# +class patch: + def __init__ (self, commit): + self.commit = commit + self.merge = self.added = self.removed = 0 + self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net') + self.email = 'unknown@hacker.net' + self.sobs = [ ] + self.reviews = [ ] + self.testers = [ ] + self.reports = [ ] + + def addreviewer (self, reviewer): + self.reviews.append (reviewer) + + def addtester (self, tester): + self.testers.append (tester) + + def addreporter (self, reporter): + self.reports.append (reporter) +# +# The core hack for grabbing the information about a changeset. 
+# +def grabpatch(): + global NextLine + + while (1): + m = Pcommit.match (NextLine) + if m: + break; + NextLine = sys.stdin.readline () + if not NextLine: + return + + p = patch(m.group (1)) + NextLine = sys.stdin.readline () + ignore = (FileFilter is not None) + while NextLine: + Line = NextLine + # + # If this line starts a new commit, drop out. + # + m = Pcommit.match (Line) + if m: + break + NextLine = sys.stdin.readline () + # + # Maybe it's an author line? + # + m = Pauthor.match (Line) + if m: + p.email = database.RemapEmail (m.group (2)) + p.author = LookupStoreHacker(m.group (1), p.email) + continue + # + # Could be a signed-off-by: + # + m = Psob.match (Line) + if m: + email = database.RemapEmail (m.group (2)) + sobber = LookupStoreHacker(m.group (1), email) + if sobber != p.author or AuthorSOBs: + p.sobs.append ((email, LookupStoreHacker(m.group (1), m.group (2)))) + continue + # + # Various other tags of interest. + # + m = Preview.match (Line) # Reviewed-by: + if m: + email = database.RemapEmail (m.group (2)) + p.addreviewer (LookupStoreHacker(m.group (1), email)) + continue + m = Ptest.match (Line) # Tested-by: + if m: + email = database.RemapEmail (m.group (2)) + p.addtester (LookupStoreHacker (m.group (1), email)) + p.author.testcredit (patch) + continue + m = Prep.match (Line) # Reported-by: + if m: + email = database.RemapEmail (m.group (2)) + p.addreporter (LookupStoreHacker (m.group (1), email)) + p.author.reportcredit (patch) + continue + m = Preptest.match (Line) # Reported-and-tested-by: + if m: + email = database.RemapEmail (m.group (2)) + h = LookupStoreHacker (m.group (1), email) + p.addreporter (h) + p.addtester (h) + p.author.reportcredit (patch) + p.author.testcredit (patch) + continue + # + # If this one is a merge, make note of the fact. + # + m = Pmerge.match (Line) + if m: + p.merge = 1 + continue + # + # See if it's the date. 
+ # + m = Pdate.match (Line) + if m: + dt = rfc822.parsedate(m.group (2)) + p.date = datetime.date (dt[0], dt[1], dt[2]) + if p.date > Today: + sys.stderr.write ('Funky date: %s\n' % p.date) + p.date = Today + continue + # + # If we have a file filter, check for file lines. + # + if FileFilter: + ignore = ApplyFileFilter (Line, ignore) + # + # OK, maybe it's part of the diff itself. + # + if not ignore: + if Padd.match (Line): + p.added += 1 + continue + if Prem.match (Line): + p.removed += 1 + + if '@' in p.author.name: + GripeAboutAuthorName (p.author.name) + + return p + +def GripeAboutAuthorName (name): + if name in GripedAuthorNames: + return + GripedAuthorNames.append (name) + print '%s is an author name, probably not what you want' % (name) + +def ApplyFileFilter (line, ignore): + # + # If this is the first file line (--- a/), set ignore one way + # or the other. + # + m = Pfilea.match (line) + if m: + file = m.group (1) + if FileFilter.search (file): + return 0 + return 1 + # + # For the second line, we can turn ignore off, but not on + # + m = Pfileb.match (line) + if m: + file = m.group (1) + if FileFilter.search (file): + return 0 + return ignore + +# +# If this patch is signed off by both Andrew Morton and Linus Torvalds, +# remove the (redundant) Linus signoff. +# +def TrimLTSOBs (p): + if AkpmOverLt == 1 and Linus in p.sobs and Akpm in p.sobs: + p.sobs.remove (Linus) + + +# +# Here starts the real program. +# +ParseOpts () + +# +# Read the config files. +# +ConfigFile.ConfigFile (CFName, DirName) + +# +# Let's pre-seed the database with a couple of hackers +# we want to remember. +# +if AkpmOverLt == 1: + Linus = ('torvalds@linux-foundation.org', + LookupStoreHacker ('Linus Torvalds', 'torvalds@linux-foundation.org')) + Akpm = ('akpm@linux-foundation.org', + LookupStoreHacker ('Andrew Morton', 'akpm@linux-foundation.org')) + +NextLine = sys.stdin.readline () +TotalChanged = TotalAdded = TotalRemoved = 0 + +# +# Snarf changesets. 
+# +print >> sys.stderr, 'Grabbing changesets...\r', + +printcount = CSCount = 0 +while (1): + if (printcount % 50) == 0: + print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount, + printcount += 1 + p = grabpatch() + if not p: + break +# if p.added > 100000 or p.removed > 100000: +# print 'Skipping massive add', p.commit +# continue + if FileFilter and p.added == 0 and p.removed == 0: + continue + + # + # Record some global information - but only if this patch had + # stuff which wasn't ignored. + # + if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge: + TotalAdded += p.added + TotalRemoved += p.removed + TotalChanged += max (p.added, p.removed) + AddDateLines (p.date, max (p.added, p.removed)) + empl = p.author.emailemployer (p.email, p.date) + empl.AddCSet (p) + if AkpmOverLt: + TrimLTSOBs (p) + for sobemail, sobber in p.sobs: + empl = sobber.emailemployer (sobemail, p.date) + empl.AddSOB() + + if not p.merge: + p.author.addpatch (p) + for sobemail, sob in p.sobs: + sob.addsob (p) + for hacker in p.reviews: + hacker.addreview (p) + for hacker in p.testers: + hacker.addtested (p) + for hacker in p.reports: + hacker.addreport (p) + CSCount += 1 + csv.AccumulatePatch (p, Aggregate) +print >> sys.stderr, 'Grabbing changesets...done ' + +if DumpDB: + database.DumpDB () +database.MixVirtuals () + +# +# Say something +# +hlist = database.AllHackers () +elist = database.AllEmployers () +ndev = nempl = 0 +for h in hlist: + if len (h.patches) > 0: + ndev += 1 +for e in elist: + if e.count > 0: + nempl += 1 +reports.Write ('Processed %d csets from %d developers\n' % (CSCount, + ndev)) +reports.Write ('%d employers found\n' % (nempl)) +reports.Write ('A total of %d lines added, %d removed (delta %d)\n' % + (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved)) +if TotalChanged == 0: + TotalChanged = 1 # HACK to avoid div by zero +if DateStats: + PrintDateStats () + +csv.OutputCSV (CSVFile) +if CSVFile is not None: + CSVFile.close () + +if DevReports: + 
reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved) +reports.EmplReports (elist, TotalChanged, CSCount) diff --git a/gitdm/gitdm.config b/gitdm/gitdm.config new file mode 100644 index 0000000..588d6ef --- /dev/null +++ b/gitdm/gitdm.config @@ -0,0 +1,22 @@ +# +# This is a sample gitdm configuration file. +# + +# +# EmailAliases lets us cope with developers who use more +# than one address. +# +EmailAliases sample-config/aliases + +# +# EmailMap does the main work of mapping addresses onto +# employers. +# +EmailMap sample-config/domain-map + +# +# Use GroupMap to map a file full of addresses to the +# same employer +# +# GroupMap sample-config/illuminati The Illuminati +# diff --git a/gitdm/linetags b/gitdm/linetags new file mode 100755 index 0000000..2051b57 --- /dev/null +++ b/gitdm/linetags @@ -0,0 +1,85 @@ +#!/usr/bin/python +# +# Find out how many lines were introduced in each major release. +# +# linetags +# +# This code is part of the LWN git data miner. +# +# Copyright 2007-11 Eklektix, Inc. +# Copyright 2007-11 Jonathan Corbet +# +# This file may be distributed under the terms of the GNU General +# Public License, version 2. +# +import sys, re, os, pickle + +CommitLines = { } + +commitpat = re.compile(r'^([\da-f][\da-f]+) ') + +def GetCommitLines(file): + print file + blame = os.popen('git blame -p ' + file, 'r') + for line in blame.readlines(): + m = commitpat.search(line) + # + # All-zero commits mean we got fed a file that git doesn't + # know about. We could throw an exception and abort processing + # now, or we can just silently ignore it... + # + if not m or m.group(1) == '0000000000000000000000000000000000000000': + continue + try: + CommitLines[m.group(1)] += 1 + except KeyError: + CommitLines[m.group(1)] = 1 + blame.close() + +# +# Try to figure out which tag is the first to contain each commit. 
+# +refpat = re.compile(r'^(v2\.6\.\d\d).*$') +def CommitToTag(commit): + try: + return DB[commit] + except KeyError: + print 'Missing commit %s' % (commit) + return 'WTF?' + +TagLines = { } +def MapCommits(): + print 'Mapping tags...' + for commit in CommitLines.keys(): + tag = CommitToTag(commit) + try: + TagLines[tag] += CommitLines[commit] + except KeyError: + TagLines[tag] = CommitLines[commit] + +# +# Here we just plow through all the files. +# +if len(sys.argv) != 2: + sys.stderr.write('Usage: linetags directory\n') + sys.exit(1) +# +# Grab the tags/version database. +# +dbf = open('committags.db', 'r') +DB = pickle.load(dbf) +dbf.close() + +out = open('linetags.out', 'w') +os.chdir(sys.argv[1]) +files = os.popen('/usr/bin/find . -type f', 'r') +for file in files.readlines(): + if file.find('.git/') < 0: + GetCommitLines(file[:-1]) +MapCommits() +# print TagLines +tags = TagLines.keys() +tags.sort() +for tag in tags: + out.write('%s %d\n' % (tag, TagLines[tag])) +out.close() diff --git a/gitdm/mahara.aliases b/gitdm/mahara.aliases new file mode 100644 index 0000000..d9b3965 --- /dev/null +++ b/gitdm/mahara.aliases @@ -0,0 +1,4 @@ +richardm@mahara.org richard.mansfield@catalyst.net.nz +richardm@catalyst.net.nz richard.mansfield@catalyst.net.nz +alan@eos.wgtn.cat-it.co.nz alan.mcnatty@catalyst.net.nz +evangoldenberg@gmail.com evang@catalyst.net.nz diff --git a/gitdm/mahara.config b/gitdm/mahara.config new file mode 100644 index 0000000..c3dc185 --- /dev/null +++ b/gitdm/mahara.config @@ -0,0 +1,22 @@ +# +# This is a sample gitdm configuration file. +# + +# +# EmailAliases lets us cope with developers who use more +# than one address. +# +EmailAliases mahara.aliases + +# +# EmailMap does the main work of mapping addresses onto +# employers. 
+# +EmailMap mahara.domain-map + +# +# Use GroupMap to map a file full of addresses to the +# same employer +# +# GroupMap sample-config/illuminati The Illuminati +# diff --git a/gitdm/mahara.domain-map b/gitdm/mahara.domain-map new file mode 100644 index 0000000..7d5e72d --- /dev/null +++ b/gitdm/mahara.domain-map @@ -0,0 +1,13 @@ +# +# Here is a set of mappings of domain names onto employer names. +# +catalyst.net.nz Catalyst IT +catalyst-eu.net Catalyst IT +luns.net.uk Lancaster University Network Services +ucsf.edu University of California, San Francisco +liip.ch Liip +mjollnir.org Liip +discendum.com Discendum +business.aau.dk Aalborg University +mondragon.edu Mondragon University +majen.net Virtual Learning Academy Charter School diff --git a/gitdm/patterns.py b/gitdm/patterns.py new file mode 100644 index 0000000..e63efb6 --- /dev/null +++ b/gitdm/patterns.py @@ -0,0 +1,39 @@ +# +# Pull together regular expressions used in multiple places. +# +# This code is part of the LWN git data miner. +# +# Copyright 2007-11 Eklektix, Inc. +# Copyright 2007-11 Jonathan Corbet +# +# This file may be distributed under the terms of the GNU General +# Public License, version 2. +# +import re + +# +# Some people, when confronted with a problem, think "I know, I'll use regular +# expressions." Now they have two problems. 
+# -- Jamie Zawinski +# +Pemail = r'\s+"?([^<"]+)"?\s<([^>]+)>' # just email addr + name +Pcommit = re.compile (r'^commit ([0-9a-f ]+)$') +Pauthor = re.compile (r'^Author:' + Pemail + '$') +Psob = re.compile (r'^\s+Signed-off-by:' + Pemail + '.*$') +Pmerge = re.compile (r'^Merge:.*$') +Padd = re.compile (r'^\+[^+].*$') +Prem = re.compile (r'^-[^-].*$') +Pdate = re.compile (r'^(Commit)?Date:\s+(.*)$') +Pfilea = re.compile (r'^---\s+(.*)$') +Pfileb = re.compile (r'^\+\+\+\s+(.*)$') +Preview = re.compile (r'^\s+Reviewed-by:' + Pemail + '.*$') +Ptest = re.compile (r'^\s+tested-by:' + Pemail + '.*$', re.I) +Prep = re.compile (r'^\s+Reported-by:' + Pemail + '.*$') +Preptest = re.compile (r'^\s+reported-and-tested-by:' + Pemail + '.*$', re.I) +# +# Merges are described with a variety of lines. +# +PExtMerge = re.compile(r'^ +Merge( branch .* of)? ([^ ]+:[^ ]+)\n$') +PIntMerge = re.compile(r'^ +(Merge|Pull) .* into .*$') +# PIntMerge2 = re.compile(r"^ +Merge branch(es)? '.*$") +PIntMerge2 = re.compile(r"^ +Merge .*$") diff --git a/gitdm/reports.py b/gitdm/reports.py new file mode 100644 index 0000000..268fe0a --- /dev/null +++ b/gitdm/reports.py @@ -0,0 +1,343 @@ +# +# A new home for the reporting code. +# +# This code is part of the LWN git data miner. +# +# Copyright 2007-11 Eklektix, Inc. +# Copyright 2007-11 Jonathan Corbet +# +# This file may be distributed under the terms of the GNU General +# Public License, version 2. +# + +import sys + +Outfile = sys.stdout +HTMLfile = None +ListCount = 999999 + + +def SetOutput (file): + global Outfile + Outfile = file + +def SetHTMLOutput (file): + global HTMLfile + HTMLfile = file + +def SetMaxList (max): + global ListCount + ListCount = max + + +def Write (stuff): + Outfile.write (stuff) + + + +# +# HTML output support stuff. +# +HTMLclass = 0 +HClasses = ['Even', 'Odd'] + +THead = '''

+ + +''' + +def BeginReport (title): + global HTMLclass + + Outfile.write ('\n%s\n' % title) + if HTMLfile: + HTMLfile.write (THead % title) + HTMLclass = 0 + +TRow = ''' + +''' + +def ReportLine (text, count, pct): + global HTMLclass + if count == 0: + return + Outfile.write ('%-25s %4d (%.1f%%)\n' % (text, count, pct)) + if HTMLfile: + HTMLfile.write (TRow % (HClasses[HTMLclass], text, count, pct)) + HTMLclass ^= 1 + +def EndReport (): + if HTMLfile: + HTMLfile.write ('
%s
%s%d%.1f%%
\n\n') + +# +# Comparison and report generation functions. +# +def ComparePCount (h1, h2): + return len (h2.patches) - len (h1.patches) + +def ReportByPCount (hlist, cscount): + hlist.sort (ComparePCount) + count = 0 + BeginReport ('Developers with the most changesets') + for h in hlist: + pcount = len (h.patches) + changed = max(h.added, h.removed) + delta = h.added - h.removed + if pcount > 0: + ReportLine (h.name, pcount, (pcount*100.0)/cscount) + count += 1 + if count >= ListCount: + break + EndReport () + +def CompareLChanged (h1, h2): + return max(h2.added, h2.removed) - max(h1.added, h1.removed) + +def ReportByLChanged (hlist, totalchanged): + hlist.sort (CompareLChanged) + count = 0 + BeginReport ('Developers with the most changed lines') + for h in hlist: + pcount = len (h.patches) + changed = max(h.added, h.removed) + delta = h.added - h.removed + if (h.added + h.removed) > 0: + ReportLine (h.name, changed, (changed*100.0)/totalchanged) + count += 1 + if count >= ListCount: + break + EndReport () + +def CompareLRemoved (h1, h2): + return (h2.removed - h2.added) - (h1.removed - h1.added) + +def ReportByLRemoved (hlist, totalremoved): + hlist.sort (CompareLRemoved) + count = 0 + BeginReport ('Developers with the most lines removed') + for h in hlist: + pcount = len (h.patches) + changed = max(h.added, h.removed) + delta = h.added - h.removed + if delta < 0: + ReportLine (h.name, -delta, (-delta*100.0)/totalremoved) + count += 1 + if count >= ListCount: + break + EndReport () + +def CompareEPCount (e1, e2): + return e2.count - e1.count + +def ReportByPCEmpl (elist, cscount): + elist.sort (CompareEPCount) + count = 0 + BeginReport ('Top changeset contributors by employer') + for e in elist: + if e.count != 0: + ReportLine (e.name, e.count, (e.count*100.0)/cscount) + count += 1 + if count >= ListCount: + break + EndReport () + + +def CompareELChanged (e1, e2): + return e2.changed - e1.changed + +def ReportByELChanged (elist, totalchanged): + elist.sort 
(CompareELChanged) + count = 0 + BeginReport ('Top lines changed by employer') + for e in elist: + if e.changed != 0: + ReportLine (e.name, e.changed, (e.changed*100.0)/totalchanged) + count += 1 + if count >= ListCount: + break + EndReport () + + + +def CompareSOBs (h1, h2): + return len (h2.signoffs) - len (h1.signoffs) + +def ReportBySOBs (hlist): + hlist.sort (CompareSOBs) + totalsobs = 0 + for h in hlist: + totalsobs += len (h.signoffs) + count = 0 + BeginReport ('Developers with the most signoffs (total %d)' % totalsobs) + for h in hlist: + scount = len (h.signoffs) + if scount > 0: + ReportLine (h.name, scount, (scount*100.0)/totalsobs) + count += 1 + if count >= ListCount: + break + EndReport () + +# +# Reviewer reporting. +# +def CompareRevs (h1, h2): + return len (h2.reviews) - len (h1.reviews) + +def ReportByRevs (hlist): + hlist.sort (CompareRevs) + totalrevs = 0 + for h in hlist: + totalrevs += len (h.reviews) + count = 0 + BeginReport ('Developers with the most reviews (total %d)' % totalrevs) + for h in hlist: + scount = len (h.reviews) + if scount > 0: + ReportLine (h.name, scount, (scount*100.0)/totalrevs) + count += 1 + if count >= ListCount: + break + EndReport () + +# +# tester reporting. 
+# +def CompareTests (h1, h2): + return len (h2.tested) - len (h1.tested) + +def ReportByTests (hlist): + hlist.sort (CompareTests) + totaltests = 0 + for h in hlist: + totaltests += len (h.tested) + count = 0 + BeginReport ('Developers with the most test credits (total %d)' % totaltests) + for h in hlist: + scount = len (h.tested) + if scount > 0: + ReportLine (h.name, scount, (scount*100.0)/totaltests) + count += 1 + if count >= ListCount: + break + EndReport () + +def CompareTestCred (h1, h2): + return h2.testcred - h1.testcred + +def ReportByTestCreds (hlist): + hlist.sort (CompareTestCred) + totaltests = 0 + for h in hlist: + totaltests += h.testcred + count = 0 + BeginReport ('Developers who gave the most tested-by credits (total %d)' % totaltests) + for h in hlist: + if h.testcred > 0: + ReportLine (h.name, h.testcred, (h.testcred*100.0)/totaltests) + count += 1 + if count >= ListCount: + break + EndReport () + + + +# +# Reporter reporting. +# +def CompareReports (h1, h2): + return len (h2.reports) - len (h1.reports) + +def ReportByReports (hlist): + hlist.sort (CompareReports) + totalreps = 0 + for h in hlist: + totalreps += len (h.reports) + count = 0 + BeginReport ('Developers with the most report credits (total %d)' % totalreps) + for h in hlist: + scount = len (h.reports) + if scount > 0: + ReportLine (h.name, scount, (scount*100.0)/totalreps) + count += 1 + if count >= ListCount: + break + EndReport () + +def CompareRepCred (h1, h2): + return h2.repcred - h1.repcred + +def ReportByRepCreds (hlist): + hlist.sort (CompareRepCred) + totalreps = 0 + for h in hlist: + totalreps += h.repcred + count = 0 + BeginReport ('Developers who gave the most report credits (total %d)' % totalreps) + for h in hlist: + if h.repcred > 0: + ReportLine (h.name, h.repcred, (h.repcred*100.0)/totalreps) + count += 1 + if count >= ListCount: + break + EndReport () + + + +def CompareESOBs (e1, e2): + return e2.sobs - e1.sobs + +def ReportByESOBs (elist): + elist.sort 
(CompareESOBs) + totalsobs = 0 + for e in elist: + totalsobs += e.sobs + count = 0 + BeginReport ('Employers with the most signoffs (total %d)' % totalsobs) + for e in elist: + if e.sobs > 0: + ReportLine (e.name, e.sobs, (e.sobs*100.0)/totalsobs) + count += 1 + if count >= ListCount: + break + EndReport () + +def CompareHackers (e1, e2): + return len (e2.hackers) - len (e1.hackers) + +def ReportByEHackers (elist): + elist.sort (CompareHackers) + totalhackers = 0 + for e in elist: + totalhackers += len (e.hackers) + count = 0 + BeginReport ('Employers with the most hackers (total %d)' % totalhackers) + for e in elist: + nhackers = len (e.hackers) + if nhackers > 0: + ReportLine (e.name, nhackers, (nhackers*100.0)/totalhackers) + count += 1 + if count >= ListCount: + break + EndReport () + + +def DevReports (hlist, totalchanged, cscount, totalremoved): + ReportByPCount (hlist, cscount) + ReportByLChanged (hlist, totalchanged) + ReportByLRemoved (hlist, totalremoved) + ReportBySOBs (hlist) + ReportByRevs (hlist) + ReportByTests (hlist) + ReportByTestCreds (hlist) + ReportByReports (hlist) + ReportByRepCreds (hlist) + +def EmplReports (elist, totalchanged, cscount): + ReportByPCEmpl (elist, cscount) + ReportByELChanged (elist, totalchanged) + ReportByESOBs (elist) + ReportByEHackers (elist) + diff --git a/gitdm/treeplot b/gitdm/treeplot new file mode 100755 index 0000000..3e128be --- /dev/null +++ b/gitdm/treeplot @@ -0,0 +1,333 @@ +#!/usr/bin/python +# +# Create a graph of patch flow into the mainline. +# +# This code is part of the LWN git data miner. +# +# Copyright 2007-11 Eklektix, Inc. +# Copyright 2007-11 Jonathan Corbet +# +# This file may be distributed under the terms of the GNU General +# Public License, version 2. +# +import sys +import patterns + +# +# The various types of commit we understand. 
+# +class Commit: + def __init__(self, id, parent): + self.id = id + self.parent = parent + self.ismerge = 0 + self.treepriority = 0 +# +# Merges are special +# +class Merge (Commit): + def __init__(self, id, parent): + Commit.__init__(self, id, parent) + self.ismerge = 1 + self.internal = 1 # Two branches within a repo? + self.parents = [ parent ] + + def addparent(self, parentid): + self.parents.append(parentid) + + def addtree(self, tree): + self.tree = tree + self.internal = 0 + +# +# Trees: where the commits come from. +# +class Tree: + def __init__(self, name, url): + self.name = name + self.url = url + self.inputs = [ ] + self.commits = [ ] + + def addcommit(self, id): + self.commits.append(id) + + def addinput(self, tree): + if tree not in self.inputs: + self.inputs.append(tree) + # print '%s -> %s' % (tree.name, self.name) + +Mainline = Tree('Mainline', + 'git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git') +KnownTrees = { Mainline.url: Mainline } + +def NormalizeURL(url): + if url[:4] == 'git:': + return url + if url == '../net-2.6/': + url = 'git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6' + url = url.replace('master.kernel.org:', 'git://git.kernel.org') + if url[-18:] == 'torvalds/linux-2.6': + url += '.git' + if url[:8] == '/pub/scm': + url = 'git://git.kernel.org' + url + return url + +def LookupTree(url): + url = NormalizeURL(url) + try: + return KnownTrees[url] + except KeyError: + tree = Tree(url, url) + KnownTrees[url] = tree + return tree + +# +# We track which tree every commit belongs to. 
+# +CommitTrees = { } +class CTEntry: + def __init__ (self, tree, priority, path): + self.tree = tree + self.priority = priority + self.path = path + +def AddCommitTree(id, entry): +# print 'add: ', id, '[', +# for tree in entry.path: +# print tree.name, +# print ']' + try: + oldentry = CommitTrees[id] + if entry.priority < oldentry.priority: + CommitTrees[id] = entry + except KeyError: + CommitTrees[id] = entry + + +def LookupCommitTree(id): + try: + return CommitTrees[id] + except KeyError: + print 'Unfound commit %s' % (id) + return CTEntry (Mainline, 0, []) + +# +# Input handling with one-line pushback. +# +SavedLine = None +Input = sys.stdin + +def GetLine(): + global SavedLine + if SavedLine: + ret = SavedLine + SavedLine = None + return ret + return Input.readline() + +def SaveLine(line): + global SavedLine + SavedLine = line + +# +# Pull in a commit and see what it is. +# +def GetCommit(): + # + # Skip junk up to the next commit. + # + while 1: + line = GetLine() + if not line: + return None + m = patterns.Pcommit.match(line) + if m: + break + + # + # Look at the commit and see how many parents we have. + # + ids = m.group(1).split() + if len(ids) <= 1: + if len(CommitTrees.values()) > 0: + print 'No-Parent commit:', ids[0] + return GetCommit() + print 'Did you run git with --parents?' + print ids + sys.exit(1) + if len(ids) == 2: # Simple commit + return Commit(ids[0], ids[1]) + # + # OK, we have a merge. + # + merge = Merge(ids[0], ids[1]) + for id in ids[2:]: + merge.addparent(id) + # + # We need to figure out what kind of merge it is, so read through the + # descriptive text to the merge line. + # + while 1: + line = GetLine() + if not line: + print 'EOF looking for merge line' + return None + # + # Maybe it's an external merge? 
+ # + m = patterns.PExtMerge.match(line) + if m: + merge.addtree(LookupTree(m.group(2))) + return merge + # + # OK, maybe it's internal + # + if patterns.PIntMerge.match(line) or patterns.PIntMerge2.match(line): + #print 'Internal:', line[:-1] + merge.internal = 1 + return merge + m = patterns.Pcommit.match(line) + if m: + print 'Hit next commit (%s) looking for merge line' % (m.group(1)) + SaveLine(line) + return GetCommit() + +# +# Print out a tree and its inputs +# +def PrintTree(tree, indent = ''): + print '%s%4d %s' % (indent, len(tree.commits), tree.name) + for input in tree.inputs: + PrintTree(input, indent + ' ') + +# +# Let's try to build a data structure giving the patch flows. +# +class FlowNode: + def __init__(self, tree): + self.tree = tree + self.inputs = { } + self.commits = 0 + +def BuildFlowTree(): + rootnode = FlowNode(Mainline) + notree = Tree('[No tree]', '') + for centry in CommitTrees.values(): + path = centry.path + if not path: + path = [ notree ] + FillFlowPath(path, rootnode) + return rootnode + +def FillFlowPath(path, node): + node.commits += 1 + if len(path) == 0: + return + next, rest = path[0], path[1:] + try: + nextnode = node.inputs[next.name] + except KeyError: + nextnode = node.inputs[next.name] = FlowNode(next) + return FillFlowPath(rest, nextnode) + +def PrintFlowTree(ftree, indent = ''): + print '%s%3d %s' % (indent, ftree.commits, ftree.tree.name) + inputs = ftree.inputs.values() + inputs.sort(GVSort) + for input in inputs: + PrintFlowTree(input, indent + ' ') + +# +# Something for graphviz +# +GVHeader = '''digraph "runtree" { +graph [ label = "Patch flow into the mainline", + concentrate = true, + nodesep = 0.1, + rankdir = LR ]; +node [shape = polygon, + sides = 4, + height = 0.3 + fontsize = 8]; +''' + + +MainlineCommits = 0 + +def GVTree(ftree): + global MainlineCommits + MainlineCommits = ftree.commits + gvf = open('runtree.gv', 'w') + gvf.write(GVHeader) + inputs = ftree.inputs.values() + inputs.sort(GVSort) + for input 
in inputs: + GVPrintNode(gvf, input, 'Mainline') + gvf.write('}\n') + +def GVNodeName(treename): + sname = treename.split('/') + if treename.find('kernel.org') >= 0: + return '%s/%s' % (sname[-2], sname[-1]) + sep = treename.find ('://') + if sep > 0: + return treename[sep+3:] + return treename + +def GVSort(n1, n2): + return n2.commits - n1.commits + +def GVPrintNode(gvf, node, parent): + name = GVNodeName(node.tree.name) + gvf.write ('"%s" -> "%s" [taillabel = "%d", labelfontsize = 8' % (name, parent, node.commits)) + gvf.write (', arrowsize = 0.5') + if MainlineCommits/node.commits < 20: + gvf.write(', color = red') + elif MainlineCommits/node.commits < 100: + gvf.write(', color = orange'); + gvf.write(']\n') + inputs = node.inputs.values() + if inputs: + inputs.sort(GVSort) + for input in inputs: + GVPrintNode(gvf, input, name) + +# +# Main code. +# +commit = GetCommit() +ncommits = 0 +while commit: + ncommits += 1 + entry = LookupCommitTree(commit.id) + tree = entry.tree + priority = entry.priority + tree.addcommit(commit.id) + # + # For regular commits, just remember the tree involved + # + if not commit.ismerge: + AddCommitTree(commit.parent, entry) + # + # For merges we have to deal with all the parents. + # + else: + AddCommitTree(commit.parents[0], CTEntry (tree, priority, entry.path)) + if commit.internal: + for p in commit.parents[1:]: + path = entry.path + [tree] + AddCommitTree(p, CTEntry (tree, priority, entry.path)) + else: + for p in commit.parents[1:]: + path = entry.path + [commit.tree] + AddCommitTree(p, CTEntry (commit.tree, priority + 1, path)) + if commit.tree is not Mainline: + tree.addinput(commit.tree) + commit = GetCommit() + +#PrintTree(Mainline) +ftree = BuildFlowTree() +PrintFlowTree(ftree) +GVTree(ftree) +print '%d commits total, %d trees' % (MainlineCommits, len (KnownTrees.keys())) -- GitLab