bin/gnome-import

#! /usr/bin/python
# 
# This file is part of the Lampadas Documentation System.
# 
# Copyright (c) 2000, 2001, 2002 David Merrill <david@lupercalia.net>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
from Globals import *
import sys
import os
import fnmatch
import fileinput    # FIXME: I think this can go.
import re
import string

from CoreDM import dms

# Debugging verbosity: 0 is silent, 1 prints debugging messages,
# 2 prints even more of them.
DEBUG = 0

# Tags that hold no text of their own. They are recorded on the tag
# stack while parsing, but nothing is extracted from the tag itself.
IGNORE_TAGS = ('resource', 'subject', 'person')

# Maps a GNOME category string, as found in an OMF file, to the Lampadas
# topic_code it is filed under.
#
# Keys are the category text after it has been lowercased and had its
# spaces removed (that is how the import loop looks them up), with the
# levels of the category separated by '|'. List every GNOME category
# here, whether it is in English or not: all translations of a category
# map to the same lowercase English topic_code.
#
# Note that the values have to be synched up with the contents
# of the topic and topic_i18n tables.
#
# FIXME: Some of these are guesses. I only speak English.
#
# NOTE(review): several non-ASCII keys below look mis-encoded in this
# copy of the file -- verify them against the category strings the
# import actually sees after its ISO-8859-15 to UTF-8 conversion.
# NOTE(review): 'gnome|werkzeuge' maps to 'games', but "Werkzeuge" looks
# like German for "tools" -- confirm whether 'utilities' was intended.
# NOTE(review): the '...cleodelescritorio' key looks like Spanish for
# "core desktop" yet maps to 'utilities' -- confirm whether
# 'coredesktop' was intended.
GNOME_CATEGORIES = {'gnome':                                    'gnome',
                    'gnome|applets|utility':                    'utility-applets',
                    'gnome|applets|monitors':                   'monitor-applets',
                    'gnome|apliques|utilitarios':               'utility-applets',
                    'gnome|applets|amusement':                  'amusement-applets',
                    'gnome|appletter|underholdning':            'amusement-applets',
                    'gnome|appleter|underholdning':             'amusement-applets',
                    'gnome|applets|clock':                      'clock-applets',
                    'gnome|appletter|ure':                      'clock-applets',
                    'gnome|applets|multimedia':                 'multimedia-applets',
                    'gnome|applets|network':                    'network-applets',
                    'gnome|appletter|netv脱rk':                 'network-applets',
                    'gnome|appleter|nettverk':                  'network-applets',
                    'gnome|appletter|v脱rkt淡jer':              'utility-applets',
                    'gnome|appleter|verkt淡y':                  'utility-applets',
                    'gnome|appletek|seg辿gprogramok':           'utility-applets',
                    'gnome|coredesktop':                        'coredesktop',
                    'gnome|scrivebord':                         'coredesktop',
                    'gnome|skrivebord':                         'coredesktop',
                    'gnome|system':                             'system',
                    'gnome|utilities':                          'utilities',
                    'gnome|utilidades':                         'utilities',
                    'gnome|n炭cleodelescritorio':               'utilities',
                    'gnome|utilitateak':                        'utilities',
                    'gnome|multimedia':                         'multimedia',
                    'gnome|促続促蔵促誕ヂ溝ヂ�ッ促誕ッ':       'utilities',
                    'gnome|games':                              'games',
                    'gnome|spil':                               'games',
                    'gnome|juegos':                             'games',
                    'gnome|werkzeuge':                          'games',
                    'gnome|graphics':                           'gnome-graphics',
                    'applications':                             'applications',
                    'applikasjoner':                            'applications',
                    'applications|utilities|terminals':         'terminals',
                    'applications|applets|network':             'network-applets',
                    'applikasjoner|x|skrivebord|gnome':         'coredesktop',
                    'applications|multimedia|graphics|other':   'graphics',
                    'general':                                  'misc',
                    'general|licenses':                         'licenses',
                    'general|linux':                            'misc_linux',
                   }


class Person:
    """Name parts and e-mail address of one person named in an OMF file."""

    def __init__(self):
        # Every field starts out as the empty string, meaning "not set".
        for field in ('firstname', 'middlename', 'lastname', 'email'):
            setattr(self, field, '')

class OMF:
    """
    The metadata held in one GNOME OMF file.

    parse_xml() fills in the attributes from the XML text using a small
    regular-expression based parser. It is not a general XML parser: for
    instance, an element nested inside another element of the same name
    is not handled. Fatal parse problems are reported on standard output
    and end the program through sys.exit(1).

    String attributes are '' and list attributes are empty until a
    matching tag has been parsed.
    """

    # Tags that describe a person, mapped to the attribute holding the
    # list of Person objects collected for that tag.
    _PERSON_LISTS = {'creator':     'creators',
                     'maintainer':  'maintainers',
                     'contributor': 'contributors'}

    def __init__(self):
        self.title = ''
        self.categories = []
        self.creators = []
        self.maintainers = []
        self.contributors = []
        self.mime = ''
        self.language = ''
        self.url = ''
        self.description = ''
        self.type = ''
        self.date = ''
        self.version = ''
        self.version_date = ''
        self.version_description = ''
        self.seriesid = ''
        self.license = ''
        self.license_version = ''
        self.copyright_holder = ''

        # Stack of the tags currently being parsed, innermost last.
        self.tags = []

    def parse_xml(self, xml):
        """
        Parse the text of an OMF file and store what it describes on self.

        xml -- the complete text of the file.

        Exits the program if nothing is left to parse once everything
        outside <omf>...</omf> and all comments have been discarded.
        """

        # Put the document on a single line. Line breaks become a space
        # (not nothing) so that words or attributes separated only by a
        # line break are not glued together.
        temp = re.sub(r'[\r\n]+', ' ', xml)

        # Compress out whitespace around elements
        temp = re.sub(r'>\s+', '>', temp)
        temp = re.sub(r'\s+<', '<', temp)

        # Throw away <omf></omf> and everything outside.
        temp = re.sub(r'.*<omf>', '', temp)
        temp = re.sub(r'</omf>.*', '', temp)

        # Throw away comments
        temp = re.sub(r'<!--.*?-->', '', temp)

        if not temp:
            print('ERROR: no OMF content to parse')
            sys.exit(1)

        self.parse_tags(temp)

    def parse_tags(self, xml):
        """
        Parse the first element (or text run) in xml, then its siblings.

        xml -- markup with no whitespace around elements; '' is accepted
               and does nothing.

        The element's tag is kept on self.tags while its contents are
        processed, so nested tags can tell which element they belong to.
        An attribute that is missing from an element is stored as ''.
        Exits the program on a tag it does not know how to handle.
        """
        if not xml:
            return

        tag, elements, contents, outside = self.parse_next_tag(xml)
        self.tags.append(tag)

        if tag in IGNORE_TAGS:
            # Nothing to extract from the tag itself, only from its contents.
            self.parse_tags(contents)
        elif tag == 'title':
            self.title = contents
        elif tag == 'category':
            self.categories.append(contents)
        elif tag == 'identifier':
            self.url = elements.get('url', '')
        elif tag == 'format':
            self.mime = elements.get('mime', '')
        elif tag == 'language':
            self.language = elements.get('code', '')
        elif tag in self._PERSON_LISTS:
            # Start a new person; the contents fill in the details.
            getattr(self, self._PERSON_LISTS[tag]).append(Person())
            self.parse_tags(contents)
        elif tag in ('firstname', 'lastname', 'email'):
            # The tag name is also the name of the Person attribute.
            setattr(self.get_last_person(), tag, contents)
        elif tag == 'description':
            self.description = contents
        elif tag == 'type':
            self.type = contents
        elif tag == 'date':
            self.date = contents
        elif tag == 'version':
            self.version = elements.get('identifier', '')
            self.version_date = elements.get('date', '')
            self.version_description = elements.get('description', '')
        elif tag == 'relation':
            self.seriesid = elements.get('seriesid', '')
        elif tag == 'rights':
            self.license = elements.get('type', '')
            self.license_version = elements.get('license.version', '')
            self.copyright_holder = elements.get('holder', '')
        elif tag == '#text':
            # Bare text is only meaningful directly inside a person tag.
            parent = ''
            if len(self.tags) >= 2:
                parent = self.tags[-2]
            if parent in self._PERSON_LISTS:
                person = getattr(self, self._PERSON_LISTS[parent])[-1]
                (person.firstname, person.middlename,
                 person.lastname, person.email) = self.parse_person(contents)
            else:
                print('ERROR: this belongs to what? ' + contents)
                sys.exit(1)
        else:
            print('ERROR: cannot handle tag %s' % tag)
            sys.exit(1)

        # Pop this tag back off the stack before moving on to what
        # follows it, so that siblings never see it as an ancestor.
        self.tags.pop()
        self.parse_tags(outside)

    def parse_next_tag(self, xml):
        """
        Split xml into its first element and whatever follows it.

        Returns (tag, elements, contents, outside):
          tag      -- lowercased tag name, or '#text' if xml does not
                      start with '<'
          elements -- dictionary of the tag's attributes ('' for '#text')
          contents -- text between the opening and closing tag; '' for a
                      shortcut (<tag/>) element, all of xml for '#text'
          outside  -- whatever follows the element

        Exits the program if xml starts with '<' but no element can be
        recognized.
        """
        if not xml.startswith('<'):
            return '#text', '', xml, ''

        m = re.match(r'<(\w+)\s*.*?>', xml)
        if not m:
            print('ERROR: cannot find a tag name at the start of ' + xml)
            sys.exit(1)
        tag = m.group(1)

        # First try a full element: <tag ...>contents</tag>
        m = re.match('<' + tag + r'\s*(.*?)>(.*?)</' + tag + '>(.*)', xml)
        if m:
            elements = self.parse_elements(m.group(1))
            contents = m.group(2)
            outside = m.group(3)
        else:
            # If we didn't match, we have shortcut element closure.
            m = re.match('<' + tag + r'\s*(.*?)/>(.*)', xml)
            if not m:
                print('ERROR: cannot find either a full or a shortcut element in ' + xml)
                sys.exit(1)
            elements = self.parse_elements(m.group(1))
            contents = ''
            outside = m.group(2)

        # lowercase tag once we're done matching it.
        tag = tag.lower()

        if DEBUG >= 2:
            print('------------------------------------')
            print('tag:         %s' % tag)
            print('elements:    %s' % elements)
            print('contents:    %s' % contents)
            print('outside:     %s' % outside)
        return tag, elements, contents, outside

    def parse_elements(self, xml):
        """
        Parse the attribute text of a tag into a dictionary.

        xml -- everything between the tag name and the closing '>'.

        Returns {name: value}. When a name occurs more than once the last
        value wins. For an empty xml the result is {'': ''}.
        """
        elements = {}
        remainder = xml
        while 1:
            name, value, remainder = self.parse_next_element(remainder)
            if DEBUG >= 2:
                print('ELEMENTS: ')
                print('name:      ' + name)
                print('value:     ' + value)
                print('remainder: ' + remainder)
            elements[name] = value
            if not remainder:
                break
        return elements

    def parse_next_element(self, xml):
        """
        Split the first name="value" attribute off the front of xml.

        The value may be enclosed in double or single quotes.

        Returns (name, value, remainder), or ('', '', '') for an empty
        xml. Exits the program if xml does not start with an attribute.
        """
        if xml == '':
            return '', '', ''
        m = re.match(r'([\w|.]+)=("|\')(.*?)\2\s*(.*)', xml)
        if not m:
            print('ERROR: cannot parse attribute in ' + xml)
            sys.exit(1)
        return m.group(1), m.group(3), m.group(4)

    def parse_person(self, xml):
        """
        Sometimes a <creator>, <maintainer> or <contributor> tag contains only text.
        This parses it to extract firstname, middlename, lastname and email.

        Example text: kevin@kevindumpscore.com (Kevin Conder)

        Returns (firstname, middlename, lastname, email); parts that are
        not present are ''. Exits the program if the name has more than
        three parts.
        """

        if DEBUG >= 2:
            print('NAME PARSING: ' + xml)

        # Find an email address; whatever surrounds it is the name.
        m = re.match(r'(.*?)([^\s]+@[^\s]+)(.*)', xml)
        if m:
            email = m.group(2)
            name = trim(m.group(1) + m.group(3))
            if DEBUG >= 2:
                print('name: ' + name)
        else:
            email = ''
            name = xml

        # Discard parentheses around name
        name = name.replace('(', '').replace(')', '')

        # Decide on the number of words actually present rather than on
        # the number of spaces, so repeated or surrounding whitespace
        # cannot make the unpacking below fail.
        parts = name.split()
        if len(parts) > 3:
            print('ERROR: two many names ' + str(len(parts)) + ' in ' + name)
            sys.exit(1)

        firstname = middlename = lastname = ''
        if len(parts) == 1:
            firstname = parts[0]
        elif len(parts) == 2:
            firstname, lastname = parts
        elif len(parts) == 3:
            firstname, middlename, lastname = parts

        if DEBUG >= 2:
            print('firstname:  ' + firstname)
            print('middlename: ' + middlename)
            print('lastname:   ' + lastname)
            print('email:      ' + email)
        return firstname, middlename, lastname, email

    def get_last_person(self):
        """
        Return the Person that the tag being parsed belongs to.

        That is the newest entry in the list for the innermost creator,
        maintainer or contributor tag on the tag stack. Exits the program
        if no such tag is on the stack.
        """
        last_person_tag = ''
        for tag in self.tags:
            if tag in self._PERSON_LISTS:
                last_person_tag = tag

        if not last_person_tag:
            print('ERROR: this belongs to who? ' + self.tags[-1])
            sys.exit(1)
        return getattr(self, self._PERSON_LISTS[last_person_tag])[-1]

    def print_debug(self):
        """Print every field of the record to standard output."""
        print('title:           %s' % self.title)
        for category in self.categories:
            print('category:            %s' % category)
        for creator in self.creators:
            print('creator:             %s, %s, %s' % (creator.firstname, creator.lastname, creator.email))
        for maintainer in self.maintainers:
            print('maintainer:          %s, %s, %s' % (maintainer.firstname, maintainer.lastname, maintainer.email))
        for contributor in self.contributors:
            print('contributor:         %s, %s, %s' % (contributor.firstname, contributor.lastname, contributor.email))
        print('mime:                %s' % self.mime)
        print('language:            %s' % self.language)
        print('url:                 %s' % self.url)
        print('description:         %s' % self.description)
        print('type:                %s' % self.type)
        print('date:                %s' % self.date)
        print('version:             %s' % self.version)
        print('version_date:        %s' % self.version_date)
        print('version_description: %s' % self.version_description)
        print('seriesid:            %s' % self.seriesid)
        print('license:             %s' % self.license)
        print('license_version:     %s' % self.license_version)
        print('copyright_holder:    %s' % self.copyright_holder)
        
def callback(arg, directory, files):
    """
    os.path.walk() visitor that imports the OMF files of one directory.

    arg       -- fnmatch pattern selecting the files to import
    directory -- the directory being visited
    files     -- names of the entries in that directory

    Each matching file is converted to UTF-8 with iconv, parsed by an
    OMF object, adjusted to Lampadas conventions and then stored as a
    document through dms, together with its file, its topics and its
    membership of the 'gnome' collection.

    Exits the program if a category maps to a topic code that dms does
    not know. A category with no mapping at all is reported and skipped.
    """
    for filename in files:
        if not fnmatch.fnmatch(filename, arg):
            continue

        if DEBUG >= 1:
            print('====================================')
            print('Processing %s/%s' % (directory, filename))

        absfilename = os.path.abspath(os.path.join(directory, filename))

        # Convert from ISO-8859-15 (Gnome default) into UTF-8.
        # The path is single-quoted for the shell so that spaces and
        # shell metacharacters in file names cannot break the command.
        quoted = "'" + absfilename.replace("'", "'\\''") + "'"
        command = 'iconv -f ISO-8859-15 -t UTF-8 ' + quoted
        fh = os.popen(command)
        try:
            xml = fh.read()
        finally:
            fh.close()

        # Have an OMF object parse the XML.
        omf = OMF()
        omf.parse_xml(xml)

        # Modify OMF to suit Lampadas
        omf.language = trim(omf.language.upper())
        if omf.language == 'C':
            omf.language = 'EN'
        omf.language = omf.language[:2]

        if omf.license == 'GNU FDL':
            omf.license = 'gfdl'

        # Turn a bare file name into a file:// URL below this directory.
        if omf.url and omf.url[:7] != 'file://':
            omf.url = 'file://%s/%s' % (directory, omf.url)

        if omf.type in ('manual', 'user\'s guide'):
            omf.type = 'userguide'

        if omf.seriesid == '':
            omf.seriesid = new_sk_seriesid()

        if DEBUG >= 1:
            omf.print_debug()

        doc = dms.document.new()
        doc.title            = omf.title
        doc.version          = omf.version
        doc.pub_status_code  = 'N'
        doc.pub_date         = omf.version_date
        doc.license          = omf.license
        # The license version comes from the OMF license version, not
        # from the document version.
        doc.license_version  = omf.license_version
        doc.copyright_holder = omf.copyright_holder
        doc.abstract         = omf.description
        doc.language         = omf.language
        doc.sk_seriesid      = omf.seriesid
        doc.replaced_by_id   = 0
        doc.save()

        # Add document file
        if omf.url:
            docfile          = dms.document_file.new()
            docfile.doc_id   = doc.id
            docfile.filename = omf.url
            docfile.top      = 1

            # Make sure there's a file to refer to, before saving!
            sourcefile = docfile.sourcefile
            if sourcefile is None:
                sourcefile = dms.sourcefile.new()
                sourcefile.filename = docfile.filename
                sourcefile.save()

            doc.files.add(docfile)

        # Add document topics
        for category in omf.categories:
            category = category.lower().replace(' ', '')

            # Gnome OMF files specify the category in a specific language.
            # Here, we collate them into the English topic_code,
            # because Lampadas considers translations to be the same category.
            topic_code = GNOME_CATEGORIES.get(category)
            if topic_code is None:
                print('ERROR: Cannot find category: [' + category + '] in ' + omf.language + ' ' + omf.title)
                continue

            # Double-check the topic_code.
            topic = dms.topic.get_by_id(topic_code)
            if topic is None:
                print('ERROR: Gnome category mapped to an invalid topic: ' + topic_code)
                sys.exit(1)

            doctopic            = dms.document_topic.new()
            doctopic.doc_id     = doc.id
            doctopic.topic_code = topic_code
            doc.topics.add(doctopic)

        # Store in the GNOME collection.
        doccoll                 = dms.document_collection.new()
        doccoll.collection_code = 'gnome'
        doccoll.doc_id          = doc.id
        doc.collections.add(doccoll)

def usage():
    """Print the command-line synopsis to standard output, then exit."""
    synopsis = """Usage: gnome-import [FROM-DIR]

    FROM-DIR is a file directory where we start recursively processing
    OMF files.
    """
    print(synopsis)
    sys.exit()


# Exactly one command-line argument is expected: the directory to scan.
if len(sys.argv) != 2:
    usage()
gnome_dir = sys.argv[1]

# Visit every directory below gnome_dir and import each *.omf file found.
print('Loading all omf files...')
os.path.walk(gnome_dir, callback, '*.omf')