PyPI download statistics

The PyPI (Python Package Index) download statistics are difficult to obtain: no official stats are provided, and they can only be obtained using Google BigQuery (which requires a paid account for extensive use).

Some technics were published for extracting data for a single module (for example here). I’ve managed to extract the data for all modules. In addition, I filtered out downloads performed by the most common mirroring tools (such as bandersnatch). The results are thus more accurate. The entire dataset can be downloaded in CSV format: pymod.csv

It includes per-month download stats for all Python modules (more than 100,000), since may 2016. I will try to maintain the dataset up-to-date.

Here are the download statistics of my Python modules:

Module Donwloads
Owlready owlready
Pymedtermino pymedtermino
Bibreview bibreview
Editobj editobj
Soya3 soya3
Cerealizer cerealizer
ORMithorynque ormithorynque
eClaircie eclaircie

Here is my script. Beware, before running the script, you need to create an account for Google Query, to modify the GOOGLE_APPLICATION_CREDENTIALS variable to the JSON credential file, and to modify the CACHE variable to the desired filename. In addition, the amount of processed queries is limited per-day on Google BigQuery, thus do not expect to produce the whole dataset in a single attempt – you need about 4-5 days for that.

import sys, os, glob, csv, sqlite3, datetime, json
from collections import defaultdict
from io import StringIO

import googleapiclient, googleapiclient.discovery, googleapiclient.errors
import oauth2client.client

GOOGLE_APPLICATION_CREDENTIALS = "/home/jiba/src/My Project-daaf416e4853.json"
CACHE = "/home/jiba/tmp/pymod.csv"


os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS
CACHE_DB = CACHE[:-3] + "sqlite3"

def load():
  if not os.path.exists(CACHE): return {}

  f = open(CACHE)
  c = csv.reader(f)
  xxx, *dates, xxx = next(c)
  data = { date : defaultdict(int) for date in dates }

  for row in c:
    module, *nbs, total = row
    for date, nb in zip(dates, nbs):
      data[date][module] = int(nb)
  f.close()

  return data

def save(data):
  projets = sorted({ projet for data_date in data.values() for projet in data_date.keys() })
  dates   = sorted(data.keys())
  table   = []
  for projet in projets:
    nbs = [data[date][projet] for date in dates]
    table.append([projet] + nbs + [sum(nbs)])

  b = StringIO()
  c = csv.writer(b)
  c.writerow (["Module name"] + dates + ["Total"])
  c.writerows(table)

  f = open(os.path.join(CACHE), "w")
  f.write(b.getvalue())
  f.close()

  try:    os.unlink(CACHE_DB)
  except: pass
  db = sqlite3.connect(CACHE_DB)
  sql = db.cursor()

  table = []
  last_date = max(dates)
  for projet in projets:
    nbs = [data[date][projet] for date in dates]
    table.append([projet, data[last_date][projet], sum(nbs)])

  sql.execute("""CREATE TABLE download_stats (module TEXT, last_month INTEGER, total INTEGER)""")
  sql.executemany("""INSERT INTO download_stats VALUES (?, ?, ?)""", table)
  db.commit()

def add_month(data, date, s):
  if '"' in s:
    s = s.split('"')[1]
  data_date = data[date] = defaultdict(int)
  for ligne in csv.reader(StringIO(s), delimiter = "\t"):
    data_date[ligne[0]] = int(ligne[1])


def previous_month(d):
  if d.month == 1: return datetime.date(d.year - 1, 12, 1)
  return datetime.date(d.year, d.month - 1, 1)

def last_month_day(d):
  day = 28
  while True:
    try:
      d2 = datetime.date(d.year, d.month, day)
    except ValueError:
      return d
    day += 1
    d    = d2

def update(data):
  dates   = set(data.keys())
  d       = previous_month(datetime.date.today())
  stop_at = datetime.date(2016, 4, 1)
  while d > stop_at:
    date = "%s_%02i" % (d.year, d.month)
    if not date in dates:
      update_month(data, d)
    d = previous_month(d)

class BigQuery(object):
  def __init__(self):
    self.credentials = oauth2client.client.GoogleCredentials.get_application_default()
    f = open(GOOGLE_APPLICATION_CREDENTIALS, "r")
    credentials_data = json.load(f)
    f.close()
    self.project_id = credentials_data["project_id"]
    self.bigquery   = googleapiclient.discovery.build("bigquery", "v2", credentials = self.credentials)
    self.jobs       = self.bigquery.jobs()

  def execute(self, query):
    response = self.jobs.query(projectId = self.project_id, body = { "query" : query } ).execute()
    print(response)
    print(response["jobComplete"])
    if (not response["jobComplete"]): raise RuntimeError
    return response["rows"]

BIG_QUERY = None

def update_month(data, d):
  global BIG_QUERY
  if BIG_QUERY is None:
    BIG_QUERY = BigQuery()

  print("Querying for month:", d)

  query = """SELECT GROUP_CONCAT(x, '\n') FROM (
SELECT
  CONCAT(file.project, '\t', STRING(count(*))) AS x
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP('%s-%02i-01 00:00:00'),
    TIMESTAMP('%s-%02i-%02i 23:59:59')
  )
WHERE (details.installer.name != "bandersnatch") AND (details.installer.name != "z3c.pypimirror")
GROUP BY file.project
)""" % (d.year, d.month, d.year, d.month, last_month_day(d).day)

  print(query)

  rows = BIG_QUERY.execute(query)

  s = rows[0]["f"][0]["v"]

  f = open("/home/jiba/tmp/pymod_%s_%02i.csv" % (d.year, d.month), "w")
  f.write(s)
  f.close()

  add_month(data, "%s_%02i" % (d.year, d.month), s)
  save(data)

data = load()
update(data)
save(data)