Python build an SQLite Relational Database

Discussion about Schedules Direct grabber code and data formats.
Post Reply
kmedcalf
Posts: 7
Joined: Sat Sep 13, 2008 7:01 am

Python build an SQLite Relational Database

Post by kmedcalf »

This code used the ScheduleDirect.py module to parse the XML data and build an SQLite Database from the data.

Code is released into the Public Domain.
TMSWebService.py will retrieve the data from SD,
ScheduleDirect.py parses the XML into a record stream,
LoadSQL.py uses ScheduleDirect.py and the downloaded data to build an SQLite database.

Code: Select all

# This module uses the record stream returned by SchedulesDirect.py to build an SQLite
# relational database.
#
# The code is released into the Public Domain.
# If you break it, you own both halves.
# You are free to use or modify this code.
#
# Original Code by Keith Medcalf, kmedcalf@dessus.com
# Released to SchedulesDirect on 21 January 2012

import codecs
import encodings
import os
import sqlite3
import sys
import time

import ScheduleDirect


class dbFieldRow(sqlite3.Row):
    def __getattr__(self, key):
        return self[key]
    def __repr__(self):
        return 'dbFieldRow(' + ', '.join('%s=%s' % (k, repr(self[k])) for k in self.keys()) + ')'
    def _asdict(self):
        return dict((k, self[k]) for k in self.keys())


LogStamp = lambda : time.strftime('%Y-%m-%d %H:%M:%S')


_dbfilename = 'xmltv.db'


_createTables = """
CREATE TABLE advisory ( advisory        integer primary key,
                        advisoryname    text collate nocase not null default 'Unknown' unique
                      );
create unique index advisoryinverse on advisory (advisoryname, advisory);
create unique index advisoryforward on advisory (advisory, advisoryname);

CREATE TABLE advlink  ( program         text not null collate nocase,
                        advisory        integer not null,
                        primary key (program, advisory)
                      );
create unique index advlinkinverse on advlink (advisory, program);

CREATE TABLE crew     ( crew            integer primary key,
                        surname         text collate nocase not null default '',
                        givenname       text collate nocase not null default '',
                        unique (surname, givenname)
                      );
create unique index crewforward on crew (crew, surname, givenname);
create unique index crewinverse on crew (surname, givenname, crew);

CREATE TABLE crewlink ( program         text not null collate nocase,
                        crew            integer not null,
                        role            integer not null,
                        primary key (program, role, crew)
                      );
create unique index crewbyrole on crewlink (role, crew, program);
create unique index crewbycrew on crewlink (crew, role, program);

CREATE TABLE genre    ( genre           integer primary key,
                        class           text collate nocase not null default 'Unknown' unique
                      );
create unique index genreinverse on genre (class, genre);
create unique index genreforward on genre (genre, class);

CREATE TABLE genrlink ( program         text not null collate nocase,
                        genre           integer not null,
                        relevance       integer not null default 0,
                        unique (program, relevance, genre)
                      );
create unique index genrlinkgenre on genrlink (genre, program, relevance);

CREATE TABLE lineup  (  lineup          text not null collate nocase primary key,
                        name            text not null collate nocase default 'No Name Given',
                        device          text not null collate nocase default 'No Device Given',
                        location        text not null collate nocase default 'No Location Given',
                        postalcode      text not null collate nocase default '',
                        type            text not null collate nocase default ''
                     );

CREATE TABLE map     (  lineup          text not null collate nocase,
                        station         text not null collate nocase,
                        channel         integer not null,
                        validfrom       integer not null,
                        validto         integer not null,
                        primary key (lineup, station, channel, validfrom, validto)
                     );

CREATE TABLE program  ( program         text not null collate nocase primary key,
                        showType        text not null collate nocase default '',
                        series          text collate nocase,
                        title           text collate nocase,
                        subtitle        text collate nocase,
                        description     text not null collate nocase default 'No Description Available',
                        colorCode       text not null collate nocase default 'Color',
                        starRating      text not null collate nocase default '',
                        originalAirDate date,
                        syndicatedEpisodeNumber text collate nocase,
                        year            integer,
                        mpaaRating      text default '' collate nocase,
                        runTime         integer
                      );
CREATE UNIQUE INDEX programPS on program (program, showType);
CREATE UNIQUE INDEX programSP on program (showType, program);

CREATE TABLE role     ( role            integer primary key,
                        rolename        text collate nocase not null default 'Unknown Role' unique
                      );

CREATE TABLE schedule ( time            integer not null,
                        endtime         integer not null,
                        station         text not null collate nocase,
                        program         text not null collate nocase,
                        duration        integer not null,
                        tvRating        text not null default '' collate nocase,
                        stereo          integer not null default 0,
                        ei              integer not null default 0,
                        subtitled       integer not null default 0,
                        hdtv            integer not null default 0,
                        new             integer not null default 0,
                        dolby           integer not null default 0,
                        closeCaptioned  integer not null default 0,
                        part            integer,
                        parts           integer,
                        primary key (time, station)
                      );
CREATE INDEX stationschedule on schedule(station, time, program, endtime, duration);

CREATE TABLE station (  station         text not null collate nocase primary key,
                        callsign        text not null collate nocase,
                        name            text not null collate nocase default 'No Name Given',
                        affiliate       text not null collate nocase default 'Independant',
                        fccChannelNumber integer
                     );
CREATE INDEX stationcallsign on station (callsign, station);
"""


_fixupData = """
update program
   set showType = 'Movie'
 where program like 'MV%'
   and showType = '';

update program
   set showType = 'Sports'
 where program like 'SP%'
   and showType = '';

update program
   set showType = 'Series'
 where (   program like 'EP%'
        or program like 'SH%')
   and showType = '';
"""

_genreFix = """
delete from genrlink
      where exists (select *
                      from program as P natural join genrlink as L natural join genre as G
                     where P.program = genrlink.program
                       and L.relevance = genrlink.relevance
                       and P.showtype = G.class);
update genrlink
   set relevance = 0
 where relevance = 1
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 0);
update genrlink
   set relevance = 1
 where relevance = 2
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 1);
update genrlink
   set relevance = 2
 where relevance = 3
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 2);
update genrlink
   set relevance = 3
 where relevance = 4
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 3);
update genrlink
   set relevance = 4
 where relevance = 5
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 4);
update genrlink
   set relevance = 5
 where relevance = 6
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 5);
"""


counterNames = [('lineup','lineup'),
                ('station', 'station'),
                ('map', 'map'),
                ('schedule', 'schedule'),
                ('program', 'program'),
                ('crew', 'crewlink'),
                ('genre', 'genrlink'),
                ('advisories', 'advlink'),
                ('advisory class', 'advisory'),
                ('Crew Names', 'crew'),
                ('Crew Roles', 'role')]


def buildInsert(name, attr):
    fieldList = sorted(attr.keys())
    return 'insert or replace into %s (%s) values (:%s);' % (name, ', '.join(fieldList), ', :'.join(fieldList))


def MainLine():
    rcnt = 0
    bs = chr(8) * 19
    lastSTime = None
    lastSStat = None
    lasttime = time.time()
    print LogStamp(), 'Opening Database', _dbfilename
    if not os.path.isfile(_dbfilename):
        dbc = sqlite3.connect(_dbfilename)
        dbc.executescript(_createTables)
    else:
        dbc = sqlite3.connect(_dbfilename)
    dbc.row_factory = dbFieldRow
    counts = {}
    for name, table in counterNames:
        counts[name] = [0, 0]
    print LogStamp(), 'Processing XML record:',
    print '%8d: %6.2f%%' % (rcnt, 0.0),
    dbc.execute('BEGIN;')
    dbc.execute('delete from station;')
    dbc.execute('delete from lineup;')
    dbc.execute('delete from map;')
    for name, pct, attr in ScheduleDirect.XMLDataFileParser('ddata.xml'):
        if name in counts:
            counts[name][0] += 1
            rcnt += 1
            if time.time() - lasttime > 1:
                print bs, '%8d: %6.2f%%' % (rcnt, pct),
                lasttime = time.time()
        if name in ['station', 'map', 'lineup']:
            stmt = buildInsert(name, attr)
            dbc.execute(stmt, attr)
        elif name == 'program':
            stmt = buildInsert(name, attr)
# ###       advisories records appear in the parsed record stream before the program
# ###       so we cannot delete the advlink records
# ###       this means there may be non-applicable advisories for a program if the advisory
# ###       data changes
#           dbc.execute('delete from advlink where program=:program;', attr)
            dbc.execute('delete from crewlink where program=:program;', attr)
            dbc.execute('delete from genrlink where program=:program;', attr)
            dbc.execute(stmt, attr)
        elif name == 'schedule':
            stmt = buildInsert(name, attr)
            if not lastSStat or lastSStat != attr['station']:
                lastSStat = attr['station']
                lastSTime = 0
            attr['lastSTime'] = lastSTime
            dbc.execute('delete from schedule where station=:station and time between :lastSTime and :time;', attr)
            dbc.execute(stmt, attr)
            lastSTime = attr['time'] + 1
        elif name == 'crew':
            if 'role' in attr:
                attr['rolename'] = attr['role']
                del attr['role']
            for field in ['surname', 'givenname']:
                if not field in attr:
                    attr[field] = ''
            if not 'rolename' in attr:
                attr['rolename'] = 'Unknown'
            dbc.execute('insert or ignore into role (rolename) values (:rolename);', attr)
            attr['role'] = dbc.execute('select role from role where rolename=:rolename;', attr).fetchone().role
            dbc.execute('insert or ignore into crew (surname, givenname) values (:surname, :givenname);', attr)
            attr['crew'] = dbc.execute('select crew from crew where surname=:surname and givenname=:givenname;', attr).fetchone().crew
            dbc.execute('insert or ignore into crewlink (program, crew, role) values (:program, :crew, :role);', attr)
        elif name == 'genre':
            dbc.execute('insert or ignore into genre (class) values (:class);', attr)
            attr['genre'] = dbc.execute('select genre from genre where class=:class;', attr).fetchone().genre
            dbc.execute('insert or ignore into genrlink (program, relevance, genre) values (:program, :relevance, :genre);', attr)
        elif name == 'advisories':
            if 'advisory' in attr:
                attr['advisoryname'] = attr['advisory']
                del attr['advisory']
            dbc.execute('insert or ignore into advisory (advisoryname) values (:advisoryname);', attr)
            attr['advisory'] = dbc.execute('select advisory from advisory where advisoryname=:advisoryname;', attr).fetchone().advisory
            dbc.execute('insert or ignore into advlink (program, advisory) values (:program, :advisory);', attr)
    print bs, '%8d: %6.2f%%' % (rcnt, 100.0)
    print LogStamp(), 'Cleanup Schedules'
    dbc.execute('delete from schedule where station not in (select station from station);')
    print LogStamp(), 'Set Initial Program ShowType'
    dbc.executescript(_fixupData)
    print LogStamp(), 'Fixup Genre'
    dbc.executescript(_genreFix)
    print LogStamp(), 'Commit Database to Disk'
    dbc.commit()
    print LogStamp(), 'Vacuum Database'
    dbc.execute('vacuum;')
    print LogStamp(), 'Analyze Statistics'
    dbc.execute('analyze;')
    for name, table in counterNames:
        counts[name][1] = dbc.execute('select count(*) as reccount from %s;' % (table,)).fetchone().reccount
    for name, table in counterNames:
        print LogStamp(), '%7d input %-14s records, total %7d in table %s.' % (counts[name][0], name.title(), counts[name][1], table.title())
    dbc.close()
    print LogStamp(), 'Done!'


if __name__ == '__main__':
    MainLine()

kmedcalf
Posts: 7
Joined: Sat Sep 13, 2008 7:01 am

Re: Python build an SQLite Relational Database

Post by kmedcalf »

Fixed Advisories, so uncommented code to ignore them

Code: Select all

# This module uses the record stream returned by SchedulesDirect.py to build an SQLite
# relational database.
#
# The code is released into the Public Domain.
# If you break it, you own both halves.
# You are free to use or modify this code.
#
# Original Code by Keith Medcalf, kmedcalf@dessus.com
# Released to SchedulesDirect on 21 January 2012

import codecs
import encodings
import os
import sqlite3
import sys
import time

import ScheduleDirect


class dbFieldRow(sqlite3.Row):
    def __getattr__(self, key):
        return self[key]
    def __repr__(self):
        return 'dbFieldRow(' + ', '.join('%s=%s' % (k, repr(self[k])) for k in self.keys()) + ')'
    def _asdict(self):
        return dict((k, self[k]) for k in self.keys())


LogStamp = lambda : time.strftime('%Y-%m-%d %H:%M:%S')


_dbfilename = 'xmltv.db'


_createTables = """
CREATE TABLE advisory ( advisory        integer primary key,
                        advisoryname    text collate nocase not null default 'Unknown' unique
                      );
create unique index advisoryinverse on advisory (advisoryname, advisory);
create unique index advisoryforward on advisory (advisory, advisoryname);

CREATE TABLE advlink  ( program         text not null collate nocase,
                        advisory        integer not null,
                        primary key (program, advisory)
                      );
create unique index advlinkinverse on advlink (advisory, program);

CREATE TABLE crew     ( crew            integer primary key,
                        surname         text collate nocase not null default '',
                        givenname       text collate nocase not null default '',
                        unique (surname, givenname)
                      );
create unique index crewforward on crew (crew, surname, givenname);
create unique index crewinverse on crew (surname, givenname, crew);

CREATE TABLE crewlink ( program         text not null collate nocase,
                        crew            integer not null,
                        role            integer not null,
                        primary key (program, role, crew)
                      );
create unique index crewbyrole on crewlink (role, crew, program);
create unique index crewbycrew on crewlink (crew, role, program);

CREATE TABLE genre    ( genre           integer primary key,
                        class           text collate nocase not null default 'Unknown' unique
                      );
create unique index genreinverse on genre (class, genre);
create unique index genreforward on genre (genre, class);

CREATE TABLE genrlink ( program         text not null collate nocase,
                        genre           integer not null,
                        relevance       integer not null default 0,
                        unique (program, relevance, genre)
                      );
create unique index genrlinkgenre on genrlink (genre, program, relevance);

CREATE TABLE lineup  (  lineup          text not null collate nocase primary key,
                        name            text not null collate nocase default 'No Name Given',
                        device          text not null collate nocase default 'No Device Given',
                        location        text not null collate nocase default 'No Location Given',
                        postalcode      text not null collate nocase default '',
                        type            text not null collate nocase default ''
                     );

CREATE TABLE map     (  lineup          text not null collate nocase,
                        station         text not null collate nocase,
                        channel         integer not null,
                        validfrom       integer not null,
                        validto         integer not null,
                        primary key (lineup, station, channel, validfrom, validto)
                     );

CREATE TABLE program  ( program         text not null collate nocase primary key,
                        showType        text not null collate nocase default '',
                        series          text collate nocase,
                        title           text collate nocase,
                        subtitle        text collate nocase,
                        description     text not null collate nocase default 'No Description Available',
                        colorCode       text not null collate nocase default 'Color',
                        starRating      text not null collate nocase default '',
                        originalAirDate date,
                        syndicatedEpisodeNumber text collate nocase,
                        year            integer,
                        mpaaRating      text default '' collate nocase,
                        runTime         integer
                      );
CREATE UNIQUE INDEX programPS on program (program, showType);
CREATE UNIQUE INDEX programSP on program (showType, program);

CREATE TABLE role     ( role            integer primary key,
                        rolename        text collate nocase not null default 'Unknown Role' unique
                      );

CREATE TABLE schedule ( time            integer not null,
                        endtime         integer not null,
                        station         text not null collate nocase,
                        program         text not null collate nocase,
                        duration        integer not null,
                        tvRating        text not null default '' collate nocase,
                        stereo          integer not null default 0,
                        ei              integer not null default 0,
                        subtitled       integer not null default 0,
                        hdtv            integer not null default 0,
                        new             integer not null default 0,
                        dolby           integer not null default 0,
                        closeCaptioned  integer not null default 0,
                        part            integer,
                        parts           integer,
                        primary key (time, station)
                      );
CREATE INDEX stationschedule on schedule(station, time, program, endtime, duration);

CREATE TABLE station (  station         text not null collate nocase primary key,
                        callsign        text not null collate nocase,
                        name            text not null collate nocase default 'No Name Given',
                        affiliate       text not null collate nocase default 'Independant',
                        fccChannelNumber integer
                     );
CREATE INDEX stationcallsign on station (callsign, station);
"""


_fixupData = """
update program
   set showType = 'Movie'
 where program like 'MV%'
   and showType = '';

update program
   set showType = 'Sports'
 where program like 'SP%'
   and showType = '';

update program
   set showType = 'Series'
 where (   program like 'EP%'
        or program like 'SH%')
   and showType = '';
"""

_genreFix = """
delete from genrlink
      where exists (select *
                      from program as P natural join genrlink as L natural join genre as G
                     where P.program = genrlink.program
                       and L.relevance = genrlink.relevance
                       and P.showtype = G.class);
update genrlink
   set relevance = 0
 where relevance = 1
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 0);
update genrlink
   set relevance = 1
 where relevance = 2
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 1);
update genrlink
   set relevance = 2
 where relevance = 3
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 2);
update genrlink
   set relevance = 3
 where relevance = 4
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 3);
update genrlink
   set relevance = 4
 where relevance = 5
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 4);
update genrlink
   set relevance = 5
 where relevance = 6
   and not exists (select *
                     from genrlink as G
                    where G.program = genrlink.program
                      and G.relevance = 5);
"""


counterNames = [('lineup','lineup'),
                ('station', 'station'),
                ('map', 'map'),
                ('schedule', 'schedule'),
                ('program', 'program'),
                ('crew', 'crewlink'),
                ('genre', 'genrlink'),
                ('advisories', 'advlink'),
                ('advisory class', 'advisory'),
                ('Crew Names', 'crew'),
                ('Crew Roles', 'role')]


def buildInsert(name, attr):
    fieldList = sorted(attr.keys())
    return 'insert or replace into %s (%s) values (:%s);' % (name, ', '.join(fieldList), ', :'.join(fieldList))


def MainLine():
    rcnt = 0
    bs = chr(8) * 19
    lastSTime = None
    lastSStat = None
    lasttime = time.time()
    print LogStamp(), 'Opening Database', _dbfilename
    if not os.path.isfile(_dbfilename):
        dbc = sqlite3.connect(_dbfilename)
        dbc.executescript(_createTables)
    else:
        dbc = sqlite3.connect(_dbfilename)
    dbc.row_factory = dbFieldRow
    counts = {}
    for name, table in counterNames:
        counts[name] = [0, 0]
    print LogStamp(), 'Processing XML record:',
    print '%8d: %6.2f%%' % (rcnt, 0.0),
    dbc.execute('BEGIN;')
    dbc.execute('delete from station;')
    dbc.execute('delete from lineup;')
    dbc.execute('delete from map;')
    for name, pct, attr in ScheduleDirect.XMLDataFileParser('ddata.xml'):
        if name in counts:
            counts[name][0] += 1
            rcnt += 1
            if time.time() - lasttime > 1:
                print bs, '%8d: %6.2f%%' % (rcnt, pct),
                lasttime = time.time()
        if name in ['station', 'map', 'lineup']:
            stmt = buildInsert(name, attr)
            dbc.execute(stmt, attr)
        elif name == 'program':
            stmt = buildInsert(name, attr)
            dbc.execute('delete from advlink where program=:program;', attr)
            dbc.execute('delete from crewlink where program=:program;', attr)
            dbc.execute('delete from genrlink where program=:program;', attr)
            dbc.execute(stmt, attr)
        elif name == 'schedule':
            stmt = buildInsert(name, attr)
            if not lastSStat or lastSStat != attr['station']:
                lastSStat = attr['station']
                lastSTime = 0
            attr['lastSTime'] = lastSTime
            dbc.execute('delete from schedule where station=:station and time between :lastSTime and :time;', attr)
            dbc.execute(stmt, attr)
            lastSTime = attr['time'] + 1
        elif name == 'crew':
            if 'role' in attr:
                attr['rolename'] = attr['role']
                del attr['role']
            for field in ['surname', 'givenname']:
                if not field in attr:
                    attr[field] = ''
            if not 'rolename' in attr:
                attr['rolename'] = 'Unknown'
            dbc.execute('insert or ignore into role (rolename) values (:rolename);', attr)
            attr['role'] = dbc.execute('select role from role where rolename=:rolename;', attr).fetchone().role
            dbc.execute('insert or ignore into crew (surname, givenname) values (:surname, :givenname);', attr)
            attr['crew'] = dbc.execute('select crew from crew where surname=:surname and givenname=:givenname;', attr).fetchone().crew
            dbc.execute('insert or ignore into crewlink (program, crew, role) values (:program, :crew, :role);', attr)
        elif name == 'genre':
            dbc.execute('insert or ignore into genre (class) values (:class);', attr)
            attr['genre'] = dbc.execute('select genre from genre where class=:class;', attr).fetchone().genre
            dbc.execute('insert or ignore into genrlink (program, relevance, genre) values (:program, :relevance, :genre);', attr)
        elif name == 'advisories':
            if 'advisory' in attr:
                attr['advisoryname'] = attr['advisory']
                del attr['advisory']
            dbc.execute('insert or ignore into advisory (advisoryname) values (:advisoryname);', attr)
            attr['advisory'] = dbc.execute('select advisory from advisory where advisoryname=:advisoryname;', attr).fetchone().advisory
            dbc.execute('insert or ignore into advlink (program, advisory) values (:program, :advisory);', attr)
    print bs, '%8d: %6.2f%%' % (rcnt, 100.0)
    print LogStamp(), 'Cleanup Schedules'
    dbc.execute('delete from schedule where station not in (select station from station);')
    print LogStamp(), 'Set Initial Program ShowType'
    dbc.executescript(_fixupData)
    print LogStamp(), 'Fixup Genre'
    dbc.executescript(_genreFix)
    print LogStamp(), 'Commit Database to Disk'
    dbc.commit()
    print LogStamp(), 'Vacuum Database'
    dbc.execute('vacuum;')
    print LogStamp(), 'Analyze Statistics'
    dbc.execute('analyze;')
    for name, table in counterNames:
        counts[name][1] = dbc.execute('select count(*) as reccount from %s;' % (table,)).fetchone().reccount
    for name, table in counterNames:
        print LogStamp(), '%7d input %-14s records, total %7d in table %s.' % (counts[name][0], name.title(), counts[name][1], table.title())
    dbc.close()
    print LogStamp(), 'Done!'


if __name__ == '__main__':
    MainLine()


Post Reply