Uzanto:Maksim/sozd sta

De Wikipedio
Salti al navigilo Irez a serchilo
  • cht_x_sta.py
# -*- coding: UTF-8 -*-

__version__='$Id:'

import sys, codecs, re, os
import wikipedia, date, xmlreader
from bib_tekst import *

def provstop(fstop):
    """Tell whether the stop-marker file ``fstop`` exists.

    main() polls this after every batch and returns when it is true.
    """
    marker_present = os.access(fstop, os.F_OK)
    return marker_present

def main():
    """Download the pages listed in ``<name>.txt`` and save their wikitext.

    ``<name>`` comes from ``sys.argv[1]``.  Every line of ``<name>.txt`` is a
    page title, optionally wrapped in ``[[...]]``.  Pages are fetched in
    batches of 64 with ``wikipedia.getall``.

    Output:
      * ``<name>_ot.txt`` -- report, one entry per page:
        ``NNNNNN  title`` (saved), ``+       title`` followed by
        ``-> target`` (redirect) or ``-       title`` (missing).
      * ``<name>_rez.txt`` -- all page texts, each followed by a line of
        ``=`` characters; or, when ``sys.argv[2] == '2'``, one file
        ``<name>NNNNNN.txt`` per page.

    An existing report is used to resume: titles are skipped until the last
    saved/redirect title recorded there has been seen, and numbering
    continues after the last number.  The run stops after a batch as soon as
    the file ``cht_x_sta_stop.txt`` exists.
    """
    fstop='cht_x_sta_stop.txt'
    try:
        # Remove a marker left over from an earlier run (best effort).
        os.unlink(fstop)
    except OSError:
        pass

    if provstop(fstop):
        wikipedia.output(u'stop0!\n')
        return

    fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
    fkat = fi+'.txt'
    fzap = fi
    fotch = fi+'_ot.txt'

    flotd=0
    if len(sys.argv)>=3 and sys.argv[2]==u'2':
        flotd=1

    mysite = wikipedia.getSite()

    f=codecs.open(fkat,'r',encoding='utf-8')
    try:
        t1=f.readlines()
    finally:
        f.close()

    # skon: title of the last saved/redirect entry in an existing report.
    # i: number to give to the next saved page.
    skon=u''
    i=0

    if os.access(fotch,os.F_OK):
        otch = codecs.open(fotch, 'rb', 'utf-8')
        try:
            for s in otch.readlines():
                if ord(s[0]) == 65279:
                    # Drop a leading byte-order mark.
                    s=s[1:]
                s=s.replace(u'\r',u'').rstrip(u'\n')
                # Check the length first: a blank report line leaves s empty
                # and s[0] would raise IndexError.
                if len(s)>=9 and (s[0]==u'+' or s[0].isdigit()):
                    skon=s[8:]
                    if s[0].isdigit():
                        # The first six characters hold the page number.
                        i=0
                        for zn in s[:6]:
                            i=i*10+ord(zn)-ord(u'0')
                        i+=1
        finally:
            otch.close()

    wikipedia.output(u'skon=%s\n i=%d' % (skon,i))

    otch = codecs.open(fotch, 'a', 'utf-8')
    vertfile = None
    try:
        if not flotd:
            # Start a fresh result file unless we are resuming.
            if skon==u'':
                rezhim = 'w'
            else:
                rezhim = 'a'
            vertfile = codecs.open(u'%s_rez.txt' % fzap, rezhim, 'utf-8')

        # flrab becomes 1 once the resume title has been passed.
        flrab=0

        for j0 in range(0, len(t1), 64):
            plm=[]

            for n in t1[j0:j0+64]:
                if ord(n[0]) == 65279:
                    n=n[1:]

                n=ubr_nk_prob(n)

                if n.startswith(u'[['):
                    n=n[2:]
                if len(n)>2 and n[len(n)-2:]==u']]':
                    n=n[:len(n)-2]

                wikipedia.output(u'... "%s" "%s"\n' % (n,skon))
                if skon==u'' or flrab:
                    pl = wikipedia.Page(mysite,n)
                    plm.append(pl)
                elif n==skon:
                    flrab=1

            try:
                wikipedia.getall(mysite, plm)
            except xmlreader.xml.sax._exceptions.SAXParseException:
                # One retry on an XML parse failure.
                wikipedia.getall(mysite, plm)

            for pl in plm:
                try:
                    t = pl.get()
                    if flotd:
                        stfile = codecs.open(u'%s%06d.txt' % (fzap,i), 'w', 'utf-8')
                        try:
                            stfile.write(pl.title() + '\n')
                            stfile.write(t + '\n')
                        finally:
                            stfile.close()
                    else:
                        vertfile.write(pl.title() + '\n')
                        vertfile.write(t + '\n')
                        vertfile.write('================\n')
                        vertfile.flush()
                    b = u'%06d  %s\n' % (i,pl.title())
                    i=i+1
                except wikipedia.IsRedirectPage:
                    b = u'+       %s\n-> %s\n' % (pl.title(),pl.getRedirectTarget())
                except wikipedia.NoPage:
                    b = u'-       %s\n' % pl.title()
                wikipedia.output(b)
                otch.write(b)
                otch.flush()

            if provstop(fstop):
                wikipedia.output(u'stop!\n')
                return
    finally:
        if vertfile is not None:
            vertfile.close()
        otch.close()

# Script entry point: run main() and call wikipedia.stopme() afterwards,
# whether main() returned normally or raised.
try:
    main()
finally:
    wikipedia.stopme()
    try:
        # NOTE(review): `excludefile` is not defined anywhere in the visible
        # code.  Unless the star import from bib_tekst supplies it, this line
        # raises NameError, which the bare except below swallows -- confirm
        # and remove if it is a leftover.
        excludefile.close()
    except:
        pass

* cht_mn_sta.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import sys, codecs, re
import wikipedia, date, catlib



# Read page titles from <name>.txt (name = sys.argv[1]).  The text of every
# existing page is saved to <name>NNNNNN.txt; the outcome for each title is
# written to the report <name>_ot.txt:
#   "NNNNNN  title"                 page saved
#   "+       title" + "-> target"   page is a redirect
#   "-       title"                 page does not exist
f = None
otch = None
try:
    fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
    fkat = fi+'.txt'
    fzap = fi
    fotch = fi+'_ot.txt'

    mysite = wikipedia.getSite()

    f=codecs.open(fkat,'r',encoding='utf-8')

    otch = codecs.open(fotch, 'w', 'utf-8')

    t1=f.readlines()
    i=0

    for line in t1:
        if ord(line[0]) == 65279:
            # Drop a leading byte-order mark.
            line=line[1:]

        workingcatname = line

        wikipedia.output(workingcatname)

        pl = wikipedia.Page(mysite,wikipedia.UnicodeToAsciiHtml(workingcatname))

        if pl.exists():
            try:
                t = pl.get()
                vertfile = codecs.open(u'%s%06d.txt' % (fzap,i), 'w', 'utf-8')
                try:
                    vertfile.write(pl.title() + '\n')
                    vertfile.write(t + '\n')
                finally:
                    vertfile.close()
                b = u'%06d  %s\n' % (i,pl.title())
                i=i+1
            except wikipedia.IsRedirectPage:
                b = u'+       %s\n-> %s\n' % (pl.title(),pl.getRedirectTarget())
        else:
            b = u'-       %s\n' % pl.title()

        wikipedia.output(b)
        otch.write(b)
        otch.flush()

finally:
    # The original cleanup closed an `excludefile` that this script never
    # defines (the NameError was swallowed); close the files that are
    # actually opened here instead.
    try:
        for fl in (f, otch):
            if fl is not None:
                fl.close()
    finally:
        wikipedia.stopme()

* cht_mn_cat.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import sys, codecs, re
import wikipedia, date, catlib


# Read category names from <name>.txt (name = sys.argv[1]) and write, for
# each category, a "# category" header line followed by the titles of its
# articles and then of its subcategories to <name>_rez.txt.
f = None
vertfile = None
try:
    fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
    fkat = fi+u'.txt'
    fzap = fi+u'_rez.txt'

    f=codecs.open(fkat,'r',encoding='utf-8')

    vertfile = codecs.open(fzap, 'w', 'utf-8')

    mysite = wikipedia.getSite()

    for s in f.readlines():
        if ord(s[0]) == 65279:
            # Drop a leading byte-order mark.
            s=s[1:]
        s=s.replace(u'\r',u'').rstrip(u'\n')

        wikipedia.output(s)

        workingcat = catlib.Category(mysite,s)

        vertfile.write( u'# %s\n' % s )
        vertfile.flush()

        # Local names no longer shadow the builtin `list`.
        stranicy = workingcat.articles()
        if stranicy:
            for pl in stranicy:
                vertfile.write( u'%s\n' % pl.title() )
                vertfile.flush()
        podkategorii = workingcat.subcategories()
        if podkategorii:
            for ka in podkategorii:
                vertfile.write( u'%s\n' % ka.title() )
                vertfile.flush()

finally:
    # The original cleanup closed an `excludefile` that this script never
    # defines (the NameError was swallowed); close the real handles instead.
    try:
        for fl in (f, vertfile):
            if fl is not None:
                fl.close()
    finally:
        wikipedia.stopme()

* sozd_sta.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys, os
import codecs

def ubr_nk_prob(t):
    """Return ``t`` with leading and trailing spaces, tabs, CRs and LFs removed.

    Only these four characters are stripped; other whitespace is kept.
    """
    return t.strip(' \n\t\r')


def sozd(otch,n,t,fldob):
    """Create page ``n`` with text ``t`` and log the outcome to ``otch``.

    ``n`` is stripped of surrounding whitespace and of an enclosing
    ``[[...]]``.  Report lines written to ``otch``:
      ``+  title``  the page had no text and was created with ``t``;
      ``++ title``  the page had text and ``fldob`` is set: ``t`` was
                    appended after two newlines (for a redirect the current
                    text is taken to be ``--> [[target]]`` plus a rule);
      ``-  title``  the page had text and ``fldob`` is not set: untouched;
      ``!  title``  saving raised an edit conflict.
    Uses the module-level ``mysite``.
    """
    n=ubr_nk_prob(n)
    if n.startswith(u'[['):
        n=n[2:]
    if len(n)>2 and n.endswith(u']]'):
        n=n[:-2]

    pl = wikipedia.Page(mysite,n)
    wikipedia.output(pl.title())

    # Current text of the page; empty when the page does not exist.
    try:
        st=pl.get()
    except wikipedia.NoPage:
        st=u''
    except wikipedia.IsRedirectPage:
        st=u'--> [['+pl.getRedirectTarget()+u']]\n----\n'

    try:
        if st==u'':
            pl.put(t, comment = u"", minorEdit = False)
            metka=u'+  %s\n'
        elif fldob:
            pl.put(st+u'\n\n'+t, comment = u"", minorEdit = False)
            metka=u'++ %s\n'
        else:
            wikipedia.output(u"Page %s already exists, not adding!"%pl.title())
            metka=u'-  %s\n'
        otch.write(metka % pl.title())
        otch.flush()
    except wikipedia.EditConflict:
        wikipedia.output(u'!!!EditConflict!!!\n')
        otch.write(u'!  %s\n' % pl.title())
        otch.flush()


def provstop(fstop):
    """Tell whether the stop-marker file ``fstop`` exists.

    main() checks this after each created page and returns when it is true.
    """
    marker_present = os.access(fstop, os.F_OK)
    return marker_present


def main(filename,fotch,fldob):
    """Create wiki pages from the text file ``filename``.

    Input format: a title line, then the lines of the page text, then a line
    starting with ``=======`` that ends the page.  Each page is passed to
    sozd() together with ``fldob``.

    ``fotch`` is the report file.  If it already exists, its non-trivial
    lines are counted and that many pages are skipped at the start, so an
    interrupted run resumes where it stopped.  The run stops after a page as
    soon as ``sozd_sta_stop.txt`` exists.
    """
    fstop='sozd_sta_stop.txt'
    try:
        # Remove a marker left over from an earlier run (best effort).
        os.unlink(fstop)
    except OSError:
        pass

    if provstop(fstop):
        wikipedia.output(u'stop0!\n')
        return

    f=codecs.open(filename,'rb',encoding='utf-8')
    try:
        stroki=f.readlines()
    finally:
        f.close()

    # nkon: number of pages already handled according to an existing report.
    nkon=0

    if os.access(fotch,os.F_OK):
        otch = codecs.open(fotch, 'rb', 'utf-8')
        try:
            for s in otch.readlines():
                if ord(s[0]) == 65279:
                    # Drop a leading byte-order mark.
                    s=s[1:]
                s=s.replace(u'\r',u'')
                while len(s)>0 and s[len(s)-1]==u'\n':
                    s=s[:len(s)-1]
                    wikipedia.output(u'nnn\n')
                # Report lines carry a 3-character status prefix; anything
                # after it counts as one handled page.
                s1=s[3:]
                if len(s1)>=1:
                    nkon+=1
        finally:
            otch.close()

    otch = codecs.open(fotch, 'a', 'utf-8')
    try:
        t=u''    # text collected for the current page
        n=u''    # title of the current page
        i=0      # 0: next line is a title, 1: collecting text
        sch=0    # index of the current page in the input
        for s in stroki:
            if ord(s[0]) == 65279:
                s=s[1:]
            if s[0:7] == u'=======':
                if i==1:
                    wikipedia.output(u'... %d %d "%s"\n' % (sch,nkon,n))
                    if sch>=nkon:
                        sozd(otch,n,t,fldob)
                    sch+=1
                    if provstop(fstop):
                        wikipedia.output(u'stop!\n')
                        return
                i=0
                t=u''
            elif i==0:
                n=ubr_nk_prob(s)
                i=1
            else:
                t=t+s
    finally:
        # Also runs on the early "stop" return above.
        otch.close()
#            t.append(s)



#        wikipedia.output(u"No title found - skipping a page.")


#    text=''.join(text)

# Command line: sozd_sta.py <name> [2]
#   <name>.txt     input pages
#   <name>_ot.txt  report
#   2              append to pages that already have text (fldob)
filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fotch = filename+u'_ot.txt'
filename+=u'.txt'

fldob=0
# The flag is read from argv[2], so three arguments are enough.  The test
# used to require four, which meant "<name> 2" never enabled append mode.
if len(sys.argv)>=3 and sys.argv[2]==u'2':
    fldob=1
mysite = wikipedia.getSite()

try:
    main(filename,fotch,fldob)
finally:
    # Runs whether main() returned or raised; an exception still propagates.
    wikipedia.stopme()

* sozd_alidir.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs


def _prov_i_otch(n):
    """Look up page ``n``, report its status and return (page, exists, title).

    Writes ``+ title`` or ``- title`` to the console and to ``otch``; for an
    existing redirect a ``-> target`` line follows.
    """
    pl = wikipedia.Page(mysite,wikipedia.UnicodeToAsciiHtml(n))
    ple = pl.exists()
    ba=u''
    if ple:
        si=u'+'
        try:
            pl.get()
        except wikipedia.IsRedirectPage:
            ba = u'-> %s\n' % pl.getRedirectTarget()
    else:
        si=u'-'
    ti=pl.title()
    b=u'%s %s\n%s' % (si,ti,ba)
    wikipedia.output(b)
    otch.write(b)
    otch.flush()
    return (pl,ple,ti)


def sozd(n1,n2):
    """Make whichever of the pages ``n1``/``n2`` is missing redirect to the other.

    Both pages are checked and reported first.  If exactly one of them
    exists, the other is created as ``#REDIRECT [[existing title]]`` (the
    redirect text is also used as the edit summary) and the text is logged.
    Nothing is written when both or neither exist.  Uses the module-level
    ``mysite`` and ``otch``.
    """
    (pl1,ple1,ti1)=_prov_i_otch(n1)
    (pl2,ple2,ti2)=_prov_i_otch(n2)

    # Logical operators instead of bitwise `&` on truth values.
    if ple1 and not ple2:
        novaja, cel = pl2, ti1
    elif ple2 and not ple1:
        novaja, cel = pl1, ti2
    else:
        return

    t=u'#REDIRECT [[%s]]' % cel
    novaja.put(t, comment = t, minorEdit = True)
    wikipedia.output(t)
    otch.write(u'%s\n' % t)
    otch.flush()

    return


def main():
    """Read ``filename`` as consecutive pairs of lines and call sozd() on each pair.

    A leading byte-order mark is removed from a line; otherwise lines are
    passed on unchanged.  A final unpaired line is ignored.
    """
    f=codecs.open(filename,'rb',encoding='utf-8')
    try:
        stroki=f.readlines()
    finally:
        # The handle used to be left open.
        f.close()

    n=None    # first line of the current pair, None while waiting for one
    for s in stroki:
        if ord(s[0]) == 65279:
            s=s[1:]
        if n is None:
            n=s
        else:
            sozd(n,s)
            n=None
#            t.append(s)



#        wikipedia.output(u"No title found - skipping a page.")


#    text=''.join(text)

# Command line: <input file> <report file> [-log]
filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fotch = wikipedia.argHandler(sys.argv[2], 'cht_cat')

for syroj in sys.argv[3:]:
    arg = wikipedia.argHandler(syroj, 'pagefromfile')
    if arg=="-log":
        # Route stdout through logger.Logger, logging to pagefromfile.log.
        import logger
        sys.stdout = logger.Logger(sys.stdout, filename = 'pagefromfile.log')
mysite = wikipedia.getSite()
otch = codecs.open(fotch, 'w', 'utf-8')

try:
    main()
finally:
    # Runs on success and on failure; an exception still propagates.
    wikipedia.stopme()

* sozd_ali1.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs


def prov(n1):
    """Check page ``n1`` and report its status.

    Writes ``+ title`` (exists) or ``- title`` (missing) to the console and
    to ``otch``; an existing redirect adds a ``-> target`` line.  Returns the
    tuple ``(page, exists, title)``.  Uses the module-level ``mysite``.
    """
    pl1 = wikipedia.Page(mysite,wikipedia.UnicodeToAsciiHtml(n1))
    ple1 = pl1.exists()

    perenapr = u''
    if not ple1:
        znak = u'-'
    else:
        znak = u'+'
        try:
            pl1.get()
        except wikipedia.IsRedirectPage:
            perenapr = u'-> %s\n' % pl1.getRedirectTarget()

    ti1 = pl1.title()
    stroka = u'%s %s\n%s' % (znak,ti1,perenapr)
    wikipedia.output(stroka)
    otch.write(stroka)
    otch.flush()

    return (pl1,ple1,ti1)

def sozd(pl1,ple1,ti1,pl2,ple2,ti2):
    """Create page 1 as a redirect to page 2 when only page 2 exists.

    ``pl*`` are page objects, ``ple*`` their existence flags and ``ti*``
    their titles, as returned by prov().  On creation the lines
    ``++ title1`` and the redirect text are logged.  A blank line is always
    appended to the report ``otch`` to close the entry for this pair.
    """
    # Logical operators instead of bitwise `&` on truth values.
    if ple2 and not ple1:
        t=u'#REDIRECT [[%s]]' % ti2
        pl1.put(t, comment = t, minorEdit = True)
        b=u'++ %s\n%s\n' % (ti1,t)
        wikipedia.output(b)
        otch.write(b)
        otch.flush()

    otch.write(u'\n')
    otch.flush()

    return


def main():
    """Read ``filename`` as pairs of lines (alias, target) and process each pair.

    For every pair both pages are checked with prov() and handed to sozd().
    When the target line equals the target of the previous pair, the
    previous lookup result is reused instead of checking the page again.
    A final unpaired line is ignored.
    """
    f=codecs.open(filename,'rb',encoding='utf-8')
    try:
        stroki=f.readlines()
    finally:
        # The handle used to be left open.
        f.close()

    n=None                 # first line of the current pair
    spr=u''                # target line of the previous pair
    pred=(None,0,u'')      # prov() result for the previous target

    for s in stroki:
        if ord(s[0]) == 65279:
            s=s[1:]
        if n is None:
            n=s
            continue

        (pl1,ple1,ti1)=prov(n)
        if spr!=s:
            pred=prov(s)
        (pl2,ple2,ti2)=pred
        sozd(pl1,ple1,ti1,pl2,ple2,ti2)
        spr=s
        n=None
#            t.append(s)



#        wikipedia.output(u"No title found - skipping a page.")


#    text=''.join(text)

# Command line: <input file> <report file> [-log]
filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fotch = wikipedia.argHandler(sys.argv[2], 'cht_cat')

for syroj in sys.argv[3:]:
    arg = wikipedia.argHandler(syroj, 'pagefromfile')
    if arg=="-log":
        # Route stdout through logger.Logger, logging to pagefromfile.log.
        import logger
        sys.stdout = logger.Logger(sys.stdout, filename = 'pagefromfile.log')
mysite = wikipedia.getSite()
otch = codecs.open(fotch, 'w', 'utf-8')

try:
    main()
finally:
    # Runs on success and on failure; an exception still propagates.
    wikipedia.stopme()

* saveHTML1.py

# -*- coding: utf-8  -*-
"""
(C) 2004 Thomas R. Koll, <tomk32@tomk32.de>
 Distribute under the terms of the PSF license.

This bot downloads the HTML-pages of articles and images
and saves the interesting parts, i.e. the article-text
and the footer to a file like Hauptseite.txt.

TODO:
   change the paths in the HTML-file


Options:

      -o:                Specifies the output-directory where to save the files   

      -images:           Downlaod all images
      -overwrite:[I|A|B] Ignore existing Images|Article|Both and
                         download them even if the exist


Features, not bugs:
* Won't d/l images of an article if you set -overwrite:A

"""
__version__ = '$Id: saveHTML.py,v 1.10 2005/10/13 20:10:03 leogregianin Exp $'


import wikipedia,httplib,StringIO,re,sys,md5,os, string,codecs,urllib
from htmlentitydefs import *

def extractArticle(data):
    """Split a complete HTML page into its article and footer parts.

    ``data`` is the HTML as a string.  Lines are collected into
    ``result['article']`` from the line containing ``<div id="content">``
    and into ``result['footer']`` from the line containing
    ``<div id="footer">``, each until the <div> nesting level drops back to
    the level recorded when the section started.

    Returns a dict with the keys ``'article'`` and ``'footer'``.
    """

    images = []  # never used below
    s = StringIO.StringIO(data)
    # Compiled but never used below.
    rPagestats = re.compile('.*(\<span id\=(\"|\')pagestats(\"|\')\>.*\<\/span\>).*')
    rBody = re.compile('.*<div id\=\"content\">.*')
    rFooter = re.compile('.*<div id\=\"footer\">.*')
    rDivOpen = re.compile('.*<div ')
    rDivClose = re.compile('.*<\/div>.*')
    # divLevel counts <div> nesting line by line (at most one open and one
    # close is counted per line); divLast is the level at which the current
    # section ends, -1 when no section is active.
    divLevel = 1
    divLast = -1
    inArticle = 0
    inFooter  = 0
    result = {'article':"",
              'footer':""}
    for line in s:
        # NOTE(review): lines read from the StringIO keep their trailing
        # newline, so this comparison can only match a final line without
        # one -- confirm whether a strip was intended.
        if line == "<p><br /></p>":
            continue
        # NOTE(review): both arguments appear as a plain space here, which
        # would make these two calls no-ops; the originals may have been an
        # HTML entity or a non-breaking space lost in transcription -- confirm.
        line = line.replace(" ", " ")
        line = line.replace(" ", " ")

        if rDivOpen.match(line):
            divLevel = divLevel + 1
        if rBody.match(line):
            inArticle = 1
            divLast = divLevel-2
        elif rFooter.match(line):
            divLast = divLevel-1
            inFooter  = 1
        if inArticle:
            result['article'] += line
        elif inFooter:
            result['footer'] += line
        if rDivClose.match(line):
            divLevel = divLevel - 1
            if divLevel == divLast:
                # The section's enclosing <div> has been closed.
                inArticle = 0
                inFooter = 0
                divLast = -1


    return result

def html2txt(str):
    """Decode a fixed set of percent-escapes in ``str``.

    Covers the UTF-8 escapes of the German umlauts and sharp s, plus the
    apostrophe, parentheses and comma.  Other escapes are left untouched.
    """
    zameny = (("%C3%A4", "ä"),
              ("%C3%B6", "ö"),
              ("%C3%BC", "ü"),
              ("%C3%84", "Ä"),
              ("%C3%96", "Ö"),
              ("%C3%9C", "Ü"),
              ("%C3%9F", "ß"),
              ("%27", "'"),
              ("%28", "("),
              ("%29", ")"),
              ("%2C", ","))

    # The escapes do not overlap and no replacement contains '%', so the
    # order of substitution does not matter.
    for kod, znak in zameny:
        str = re.sub(kod, znak, str)
    return str

def extractImages(data):
    """ takes a string with the complete HTML-file
    and returns the article which is contained in
    <div id='article'> and  the pagestats which
    contain information on last change """

    images = []
    rImage = re.compile('<a href=[\r\n]*?"/wiki/.*?:(.*?)".*?[\r\n]*?.*?class=[\r\n]*?"image"', re.MULTILINE)
    rThumb = re.compile('<a href=[\r\n]*?"/wiki/.*?:(.*?)".*?[\r\n]*?.*?class=[\r\n]*?"internal".*?[\r\n]*?.*?<img', re.MULTILINE or re.DOTALL)
    last = ""
    img = rImage.findall(data)
    timg = rThumb.findall(data)
    for i in timg:
        try:
            img.index(i)
        except:
            img.append(i)
    print "Bilder: ", img

    for image in img:
        path = md5.new(html2txt(image)).hexdigest()
        images.append( {'image': image,
                        'path' : str(path[0])+"/"+str(path[0:2])+"/"})
    images.sort()
    return images


def main():
    mysite = wikipedia.getSite()
    save_images = 0

    fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
    fkat = fi+'.txt'
    fzap = fi
    fotch = fi+'_ot.txt'

    if len(sys.argv)>=3:
        save_images = 1

    headers = {"Content-type": "application/x-www-form-urlencoded", 
               "User-agent": "RobHooftWikiRobot/1.0"}
    print "opening connection to", mysite.hostname(),
    conn = httplib.HTTPConnection(mysite.hostname())
    print " done"

    R = re.compile('.*/wiki/(.*)')

    f=codecs.open(fkat,'r',encoding='utf-8')
    otch = codecs.open(fotch, 'wb', 'utf-8')

    t1=f.readlines()
    nomerf=0
    for s in t1:
        if ord(s[0]) == 65279:
            s=s[1:]
        s=s.replace(u'\r',u'')
        if s[len(s)-1]==u'\n':
            s=s[:len(s)-1]
        if len(s)<1:
            continue

#    for article in sa:
#        filename = article.replace("/", "_")
#        filename = filename.replace(":", "_")
#        if os.path.isfile(output_directory + filename + ".txt") and overwrite_articles == False:
#            print "skipping " + article
#            continue
        data = ""
        ua = s
        enua = ua.replace(u' ', u'_').encode('utf-8')
        enua = urllib.quote(enua)

        url = '/wiki/'+ enua
        conn.request("GET", url, "", headers)
        response = conn.getresponse()
        data = response.read()
        if len(data) < 2:
            b = u'-       %s\n' % ua
            wikipedia.output(b)
            otch.write(b)
            otch.flush()

#        data = extractArticle(data)

        f = open(u'%s%06d.htm' % (fzap,nomerf), 'wb')
#        f.write(data['article'] + '\n' + data['footer'])
        f.write(data)
        f.flush()
        f.close()
        b = u'%06d  %s\n' % (nomerf,ua)
        wikipedia.output(b)
        otch.write(b)
        otch.flush()
        nomerf+=1

        if save_images:
#            images = extractImages(data['article'])
            images = extractImages(data)
            for i in images:
#                if overwrite_images == False and os.path.isfile(output_directory + i['image']):
#                    print "skipping existing " + i['image']
#                    continue
                print 'downloading ' + i['image'],
                uo = wikipedia.MyURLopener()
                file = uo.open( "http://upload.wikimedia.org/wikipedia/"
                                +mysite.lang + '/' + i['path'] + i['image'])
                content = file.read()
                if (len(content) < 500):
                    uo.close()
                    print "downloading from commons",
                    uo = wikipedia.MyURLopener()
                    file = uo.open( "http://commons.wikimedia.org/upload/"
                                    + i['path'] + i['image'])
                    #print "http://commons.wikimedia.org/upload/", i['path'] , i['image'], file
                    content = file.read()
#                f = open(output_directory + i['image'], "wb")
                f = open(u'%s%06d_%s' % (fzap,nomerf,i['image']), 'wb')
                f.write(content)
                f.close()
                b = u' +i %06d  %db  %s\n' % (nomerf,len(content),i['image'])
                wikipedia.output(b)
                otch.write(b)
                otch.flush()
#                print "\t\t", (len(content)/1024), "KB done"
    conn.close()

# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()

* podg_v_wiki.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs


def iskat(t,s):
    """Return the index of the first occurrence of ``s`` in ``t``, or -1.

    The previous hand-written scan stopped one position early, so a match
    at the very end of ``t`` (including ``t == s``) was reported as -1.
    """
    return t.find(s)

def ubr_nk_prob(t):
    """Return ``t`` with leading and trailing spaces, tabs, CRs and LFs removed.

    Only these four characters are stripped; other whitespace is kept.
    """
    return t.strip(' \n\t\r')


def main(ft,fvih):
    """Bundle source files into one block of wiki markup.

    ``ft`` is a list file.  Every non-empty line that starts with ``p`` names
    a file (the name starts at the third character of the line).  Each named
    file is written to ``fvih`` as

        * <file name>
        <code><pre><nowiki>
        ...file content...</nowiki></pre></code>

    NOTE(review): the tail of this function was damaged when the page was
    rendered (its own markup strings were interpreted as markup).  The
    closing string and the placement of the final write are reconstructed
    from the surviving fragments -- confirm against the original script.
    The debugging ``print type(...)`` statements were not restored.
    """
    f=open(ft,'rb')
    try:
        stroki=f.readlines()
    finally:
        f.close()

    # Collect the pieces and join once.  This also yields an empty output
    # file instead of a NameError when the list names no file.
    chasti=[]
    for s in stroki:
        s=s.replace('\r','')
        # endswith() is safe when removing '\r' left the line empty.
        if s.endswith('\n'):
            s=s[:-1]
        if len(s)<1:
            continue
        if s[0]=='p':
            fi=ubr_nk_prob(s[2:])
            f1=open(fi,'rb')
            try:
                rez1=f1.read()
            finally:
                f1.close()
            chasti.append('* %s\n<code><pre><nowiki>\n'%fi)
            chasti.append(rez1)
            chasti.append('</nowiki></pre></code>')

    fr=open(fvih, 'wb')
    try:
        fr.write(''.join(chasti))
    finally:
        fr.close()


# Command line: <list file> <output file>
if __name__ == "__main__":
    try:
        ft = wikipedia.argHandler(sys.argv[1], 'perevod')
        fvih = wikipedia.argHandler(sys.argv[2], 'perevod')
        main(ft,fvih)
    finally:
        # Runs whether main() returned or raised.
        wikipedia.stopme()

* zamen.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
from bib_tekst import *

def vivod(b):
    """Pass ``b`` to wikipedia.output and append it to the report file ``otch``.

    The report is flushed after every message.
    """
    wikipedia.output(b)
    zhurnal = otch
    zhurnal.write(b)
    zhurnal.flush()

def sravn_opc(t,i,s,opc):
    """Compare ``s`` with the text of ``t`` starting at index ``i``.

    Returns 1 when ``t[i:i+len(s)]`` matches ``s`` and 0 otherwise (also when
    ``s`` would run past the end of ``t``).  The comparison is exact, except
    that with option ``k`` in ``opc`` case is ignored for a character that
    directly follows ``[`` or ``:`` in ``s`` and, if option ``l`` is also
    present, for the first character of ``s``.
    """
    if i+len(s)>len(t):
        return 0

    bez_registra = u'k' in opc
    i_pervyj = u'l' in opc

    for k, etalon in enumerate(s):
        znak = t[i+k]
        if bez_registra:
            if k==0:
                svobodnyj = i_pervyj
            else:
                svobodnyj = s[k-1] in u'[:'
            if svobodnyj:
                znak = znak.upper()
                etalon = etalon.upper()
        if znak!=etalon:
            return 0
    return 1

def iskat_opc(t,i,s0,opc):
    """Find the next occurrence of ``s0`` in ``t`` at or after index ``i``.

    Returns ``(start, end)`` of the matched span, or ``(-1,-1)`` when nothing
    is found.  ``opc`` holds one-letter options:
      ``s``  if ``s0`` ends in ``]]``, also match the same link followed by
             ``|`` (a piped link); the span then runs to the end of the link.
      ``l``  when ``[[`` or ``{{`` is met, compare ``s0`` with the link /
             template target only (whitespace-trimmed, ``_`` read as a
             space); the span then covers just the target.
      ``k``  case handling, see sravn_opc().
    iskats_mn and iskkonpodp are not defined in this file; presumably they
    come from the star import of bib_tekst.
    """
    # Patterns to try at each position: s0 itself and, optionally, its
    # piped-link variant.
    ss=[s0]
    if (u's' in opc) and len(s0)>2 and s0[len(s0)-2:]==u']]':
        ss.append(s0[:len(s0)-2]+u'|')

    lt=len(t)
    while i<lt:
        if (u'l' in opc) and (t[i:i+2]==u'[[' or t[i:i+2]==u'{{'):
            pn=i+2
            # pk: position of the first delimiter after the opening
            # brackets (assumed from the call; -1 is treated as not found).
            (pk,ik)=iskats_mn(t,pn,[u'[[',u']]',u'{{',u'}}',u'|',u'#'])
            if pk==-1:
                pk=lt
            # Trim whitespace at both ends of the target and skip one
            # leading ':' unless s0 itself starts with ':'.
            while pn<pk and (t[pn] in [u' ',u'\n',u'\t',u'\r']):
                pn+=1
            # NOTE(review): s0[0] raises IndexError for an empty s0.
            if pn<pk and t[pn]==u':' and s0[0]!=u':':
                pn+=1
            while pn<pk and (t[pk-1] in [u' ',u'\n',u'\t',u'\r']):
                pk-=1

            ssilk=t[pn:pk].replace(u'_',u' ').replace(u'  ',u' ')
#            ssilk=perv_upper(ubr_nk_prob(ssilk))
#            vivod(u'%d %d %d %d %s %s\n'%(pn,pk,len(ssilk),len(s0),ssilk,s0))
            if len(ssilk)==len(s0) and sravn_opc(ssilk,0,s0,opc):
                return (pn,pk)
            # No match on this target: continue after it.  The pattern loop
            # below still runs once at position pk before i is advanced.
            i=pk
        for j in range(len(ss)):
            s=ss[j]
            ls=len(s)
            ravn=sravn_opc(t,i,s,opc)
            if ravn:
                if j==1:
                    # Piped-link variant: extend the span to the closing
                    # ']]' located by iskkonpodp, if there is one.
                    m=i+ls
                    m1=iskkonpodp(t,m,u'[[',u']]')
                    if m1!=-1:
                        return (i,m1+2)
                return (i,i+ls)
        i+=1
    return (-1,-1)


def zam(n,tzfl,tzst,tzno,kol,komm):
    """Apply a list of text replacements to page ``n`` and save the result.

    ``tzfl[j]``, ``tzst[j]`` and ``tzno[j]`` (``j`` in ``0..kol-1``) are the
    option string, the old text and the new text of replacement ``j``; the
    last character of the old and new text is dropped before use.  ``komm``
    is the edit summary.

    Options read here (others are passed on to iskat_opc):
      ``w``  do not strip interwiki links before replacing; they are taken
             from the text after all replacements instead.
      ``u``  skip this replacement when the new text is already present.
      ``n``  with an empty old text, insert the new text at the beginning
             instead of appending it at the end.

    Writes progress to ``otch`` and the text before/after to ``otch2``.  The
    page is saved only if at least one replacement was made.  A missing page
    or a redirect is only reported.
    """


    pl = wikipedia.Page(mysite,wikipedia.UnicodeToAsciiHtml(n))
    wikipedia.output(pl.title())
#    pl._contents=u'qwe [[qwe]] [[qwer|qwe]] [[fqwe]] qwe :qwe {{wqwe}} [[qwe|qwer]] [qwe] sqwr dwer [[:qwe]] qwet dqwe gqwej [[enn:qwe]] [[qwey]]\n'
#    pl._contents+=u'Qwe [[Qwe]] [[Qwer|Qwe]] [[fQwe]] Qwe :Qwe {{wQwe}} [[Qwe|Qwer]] [Qwe] sQwr dwer [[:Qwe]] Qwet dQwe gQwej [[enn:Qwe]] [[Qwey]]'


    try:
        nt = pl.get()
    except wikipedia.NoPage:
        b = u'- %s\n\n' % pl.title()
        vivod(b)
        return
    except wikipedia.IsRedirectPage:
        b = u'+ %s\n-> %s\n\n' % (pl.title(),pl.getRedirectTarget())
        vivod(b)
        return

    otch.write(u'+ %s\n' % pl.title())
    otch.flush()

    # Keep a copy of the original text in the second report.
    otch2.write(u'=========================\n%s\n-=======\n%s\n' % (pl.title(),nt))
    otch2.flush()

    # fliwi stays 1 unless some replacement carries the 'w' option.
    j=0
    fliwi=1
    while j<kol:
        if (u'w' in tzfl[j]):
            fliwi=0
        j+=1

    if fliwi:
        # Remember the links to other sites and strip the language links
        # from the text; they are appended again before saving.
        oiwi = pl.interwiki()
        niwi = {}
        for pl2 in oiwi:
            if pl.site()!=pl2.site():
                niwi[pl2.site()] = pl2
        nt = wikipedia.replaceLanguageLinks(nt, {})

    nt = nt.replace(u'\r',u'')
#    while u' \n' in nt:
#        nt = nt.replace(u' \n',u'\n')

    # fzam: set once any replacement has changed the text.
    fzam=0

    j=0
    while j<kol:
        # Drop the final character of both texts.
        zst=tzst[j]
        zst=zst[:len(zst)-1]
        zno=tzno[j]
        zno=zno[:len(zno)-1]
#        if mysite.lang==u'eo':
#            zst=zst.replace(u'Ĉ',u'Cx').replace(u'ĉ',u'cx')
#            zst=zst.replace(u'Ĝ',u'Gx').replace(u'ĝ',u'gx')
#            zst=zst.replace(u'Ĵ',u'Jx').replace(u'ĵ',u'jx')
#            zst=zst.replace(u'Ĥ',u'Hx').replace(u'ĥ',u'hx')
#            zst=zst.replace(u'Ŝ',u'Sx').replace(u'ŝ',u'sx')
#            zst=zst.replace(u'Ŭ',u'Ux').replace(u'ŭ',u'ux')
#            zno=zno.replace(u'Ĉ',u'Cx').replace(u'ĉ',u'cx')
#            zno=zno.replace(u'Ĝ',u'Gx').replace(u'ĝ',u'gx')
#            zno=zno.replace(u'Ĵ',u'Jx').replace(u'ĵ',u'jx')
#            zno=zno.replace(u'Ĥ',u'Hx').replace(u'ĥ',u'hx')
#            zno=zno.replace(u'Ŝ',u'Sx').replace(u'ŝ',u'sx')
#            zno=zno.replace(u'Ŭ',u'Ux').replace(u'ŭ',u'ux')
#        if (u'u' in tzfl[j]) and (zno.lower() in nt.lower()):
        if (u'u' in tzfl[j]):
            # Option 'u': nothing to do when the new text is already there.
            (p,pk)=iskat_opc(nt,0,zno,tzfl[j])
            if p!=-1:
                b=u'uzxe\n'
                wikipedia.output(b)
                otch.write(b)
                otch.flush()
                j+=1
                continue

#        nt = nt.replace(zst,zno)
        nz=0   # number of replacements made for this rule
        p0=0   # search resumes here, just after the last inserted text
        while (len(zno)>0 or len(zst)>0) and p0<len(nt):
            if zst==u'':
                # Pure insertion: at the start with option 'n', else at the
                # end of the text.
                if (u'n' in tzfl[j]):
                    p=0
                else:
                    p=len(nt)
                pk=p
            else:
#                p=iskats(nt,p0,zst)
                (p,pk)=iskat_opc(nt,p0,zst,tzfl[j])
            if p==-1:
                break
            nt=nt[:p]+zno+nt[pk:]
            p0=p+len(zno)
            nz+=1
            fzam=1
            if zst==u'':
                # An insertion is done once only.
                break

        b=u'%d\n' % nz
        wikipedia.output(b)
        otch.write(b)
        otch.flush()
        j+=1

    if not fliwi:
        # Option 'w' was used: take the interwiki links from the text as it
        # is after the replacements, via a scratch page object.
        pl1=wikipedia.Page(pl.site(),pl.title())
        pl1._contents=nt
        oiwi = pl1.interwiki()
        niwi = {}
        for pl2 in oiwi:
            if pl.site()!=pl2.site():
                niwi[pl2.site()] = pl2
        nt = wikipedia.replaceLanguageLinks(nt, {})

    if fzam:
        # Strip trailing newlines, then append the language links again.
        while nt[len(nt)-1:]==u'\n':
            nt=nt[:len(nt)-1]
#        if fliwi:
        nt+=wikipedia.replaceLanguageLinks(u'', niwi)

        otch2.write(u'+=======\n%s\n' % nt)
        otch2.flush()
        pl.put(nt, comment = komm, minorEdit = False)
    otch.write(u'\n')
    otch.flush()
    return


def main():
    """Parse the task file ``filename`` and run the replacements it describes.

    The file is a sequence of sections, each closed by a line starting with
    ``============``.  A section holds:
      * page titles, one per line (lines starting with ``#`` are ignored,
        a line starting with ``|`` gives the edit summary);
      * one or more replacement rules of the form
            ?=======
            <options line; a leading 0 disables the rule>
            -=======
            <old text lines>
            +=======
            <new text lines>
    When a section is closed, zam() is called for each of its titles with
    the enabled rules.  A malformed rule is reported with ``!er`` and the
    section is skipped.
    """
    zfl=0      # is the rule being read enabled?
    tzfl={}    # rule index -> options line
    tzst={}    # rule index -> old text
    tzno={}    # rule index -> new text
    komm=u''
    n=[]       # page titles of the current section
    f=codecs.open(filename,'rb',encoding='utf-8')

    # i is the parser state:
    #   0   reading titles / summary      2   expecting the options line
    #   3   expecting '-======='          4   reading old text
    #   5   reading new text              100 format error in this section
    # j counts the enabled rules of the current section.
    i=0
    j=0
    for s in f.readlines():
#        wikipedia.output(u'%d\n' % ord(s[0]))
        if ord(s[0]) == 65279:
            # Drop a leading byte-order mark.
            s=s[1:]
#        wikipedia.output(u'%d %d  %s   %s\n' % (i,j,n,s))
        s=s.replace(u'\r',u'')
        # Every line is made to end with '\n'; zam() later drops the last
        # character of the collected old/new text.
        if s[len(s)-1]!=u'\n':
            s=s+u'\n'

        if s[0:12] == u'============':
#            wikipedia.output(u'k%d %d  %s   %s\n' % (i,j,n,s))
            # End of section: report an error or run the rules.
            if i==100:
                b=u'!er %s\n' % n
                wikipedia.output(b)
                otch.write(b)
                otch.flush()
            if (i!=100) and (j>0):
#                wikipedia.output(u'kl%d %d  %s   %s\n' % (i,j,n,s))
                for n1 in n:
                    zam(n1,tzfl,tzst,tzno,j,komm)
            n=[]
            komm=u''
            i=0
            j=0
        elif i==0:
            if s[0:8]==u'?=======':
                i=2
            elif s[0]==u'|':
                komm=s[1:]
            elif s[0]!=u'#':
                n.append(s)
        elif i==2:
            zfl= (s[0]!=u'0')
            if zfl:
                tzfl[j]=s
                tzst[j]=u''
                tzno[j]=u''
                j+=1
            i=3
        elif i==3:
            if s[0:8]==u'-=======':
                i=4
            else:
                i=100
                wikipedia.output(u"er i==3\n")
        elif i==4:
            if s[0:8]==u'+=======':
                i=5
            else:
                if zfl:
                    tzst[j-1]+=s
        else: #if i==5:
            if s[0:8]==u'?=======':
                i=2
            else:
                if zfl:
                    tzno[j-1]+=s



#        wikipedia.output(u"No title found - skipping a page.")


#    text=''.join(text)

# Command line: zamen.py <name>
#   <name>.txt     task file
#   <name>_ot.txt  report
#   <name>_op.txt  page texts before and after the replacements
fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
filename = fi+'.txt'
fotch = fi+'_ot.txt'
fotch2 = fi+'_op.txt'

otch = codecs.open(fotch, 'w', 'utf-8')
otch2 = codecs.open(fotch2, 'w', 'utf-8')
mysite = wikipedia.getSite()

try:
    main()
finally:
    # Runs on success and on failure; an exception still propagates.
    wikipedia.stopme()

* perev_kat.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys, os
import codecs
import perevod
from bib_tekst import *
from bib_kateg import *
from get_buf import *

def vivod(b):
    """Show *b* on the console and append it to the report file.

    The report is the module-level ``otch`` stream; it is flushed after
    every message.
    """
    wikipedia.output(b)
    report = otch
    report.write(b)
    report.flush()


def main(slov,vhjaz,vihjaz,fivh,firez):
    """Translate the categories listed in *fivh* and write them to *firez*.

    Every input line is trimmed with ``ubr_nk_prob``; the lines are sorted and
    consecutive duplicates are processed only once.  A line must split into
    exactly three parts on its first two colons, otherwise it is reported via
    ``vivod`` and skipped.  Parts one and three build the ``Kateg`` that is
    handed to ``perev_kateg``; for each returned name the record
    ``<vihjaz>:<name>\\n<source line>\\n\\n`` is written to *firez*.

    *vhjaz* is not used by this function.  The module-level ``otch`` report
    stream and ``nssvoj`` table are read as globals.
    """
    f=codecs.open(fivh,'rb',encoding='utf-8')
    frez=None
    try:
        frez=codecs.open(firez, 'w', 'utf-8')
        ish=[]
        for s in f.readlines():
            # drop a leading byte-order mark (U+FEFF)
            if ord(s[0]) == 65279:
                s=s[1:]
            s=ubr_nk_prob(s)
            ish.append(s)
        # The original had a bare `f.close`, which only looked the method up
        # and never closed the file.
        f.close()

        ish.sort()
        vihsi=wikipedia.getSite(code = vihjaz,fam = u'wikipedia')

        spred=u''
        for s in ish:
            # the list is sorted, so equal lines are adjacent
            if s==spred:
                continue
            spred=s
            st=s.split(u':',2)
            if len(st)!=3:
                vivod(u'!!! len(st)!=3 s=%s\n'%s)
                continue
            kato=[Kateg(st[0],st[2])]
            nt_kat_sp=perev_kateg(slov,nssvoj,vihsi,kato,1,otch)
            otch.write(u'\n\n===============\n\n')
            otch.flush()
            for t in nt_kat_sp:
                frez.write(u'%s:%s\n%s\n\n'%(vihjaz,t,s))
                frez.flush()
    finally:
        # release both files even when translation fails half-way
        f.close()
        if frez is not None:
            frez.close()


if __name__ == "__main__":
    # Arguments: <vhjaz or '-'> <vihjaz> <dictionary file> <input file>
    #            <output base name>
    try:
        vhjaz = wikipedia.argHandler(sys.argv[1], 'perevod')
        vihjaz = wikipedia.argHandler(sys.argv[2], 'perevod')
        fslov = wikipedia.argHandler(sys.argv[3], 'perevod')
        fvh = wikipedia.argHandler(sys.argv[4], 'perevod')
        fi = wikipedia.argHandler(sys.argv[5], 'perevod')
        fvih= fi+u'.txt'
        fotch = fi+u'-ot.txt'

        # a lone dash stands for "no source language"
        if vhjaz==u'-':
            vhjaz=u''

        otch = codecs.open(fotch, 'w', 'utf-8')
        slov={}
        nssvoj={}
        slovdop=[]
        perevod.zagruzslov(slov,slovdop,nssvoj,vhjaz,vihjaz,fslov)
        main(slov,vhjaz,vihjaz,fvh,fvih)
    finally:
        # runs on success and on failure; an exception still propagates
        wikipedia.stopme()

* perev_oivi.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys, os
import codecs
import perevod
from bib_tekst import *
from bib_kateg import *

def vivod(b):
    """Show *b* on the console and append it to the ``otch`` report stream.

    NOTE(review): in this script the lines that would open ``otch`` in the
    entry point are commented out, so calling this helper presumably fails
    with NameError unless ``otch`` comes from a star import - confirm.
    """
    wikipedia.output(b)
    report = otch
    report.write(b)
    report.flush()


def sozd(vhjaz,vihjaz,n,t):
    """Build an "interwiki" replacement record for article *n*.

    *t* is installed as the text of the *vhjaz* page *n*, and its interwiki
    links are scanned for one pointing at the *vihjaz* wikipedia.  If several
    match, the last one wins.  Returns u'' when there is none, otherwise a
    text record naming the target title and the link ``[[vhjaz:n]]``.
    """
    src_site=wikipedia.getSite(code = vhjaz,fam = u'wikipedia')
    dst_site=wikipedia.getSite(code = vihjaz,fam = u'wikipedia')

    page=wikipedia.Page(src_site,n)
    page._contents=t

    target=u''
    for link in page.interwiki():
        if link.site()==dst_site:
            target=link.title()

    if target==u'':
        return u''

    template=(u'|interwiki\n%s\n'
              u'?=======\n1wu\n-=======\n+=======\n\n[[%s:%s]]\n'
              u'============\n')
    return template%(target,vhjaz,n)

def main(vhjaz,vihjaz,filename,firez):
    """Write an interwiki record to *firez* for each article in *filename*.

    The input is a sequence of articles: a title line, the text lines, then
    a line starting with '======='.  Each complete article is passed to
    ``sozd`` and the returned record is appended to *firez*.  An article that
    is not followed by a separator line is never processed.

    Both files are closed on exit, including when an error occurs (the
    original left them open).
    """
    t=u''
    n=u''
    f=codecs.open(filename,'rb',encoding='utf-8')
    frez=None
    try:
        frez=codecs.open(firez, 'w', 'utf-8')
        # i==0: the next line is a title; i==1: collecting article text
        i=0
        for s in f.readlines():
            # drop a leading byte-order mark (U+FEFF)
            if ord(s[0]) == 65279:
                s=s[1:]
            if s[0:7] == u'=======':
                if i==1:
                    rez=sozd(vhjaz,vihjaz,n,t)
                    if rez is not None:
                        frez.write(rez)
                        frez.flush()
                i=0
                t=u''
            elif i==0:
                n=ubr_nk_prob(s)
                i=1
            else:
                t=t+s
    finally:
        f.close()
        if frez is not None:
            frez.close()


if __name__ == "__main__":
    # Arguments: <vhjaz> <vihjaz> <input file> <output base name>
    try:
        vhjaz = wikipedia.argHandler(sys.argv[1], 'perevod')
        vihjaz = wikipedia.argHandler(sys.argv[2], 'perevod')
        fvh = wikipedia.argHandler(sys.argv[3], 'perevod')
        fi = wikipedia.argHandler(sys.argv[4], 'perevod')
        fvih= fi+u'.txt'
        main(vhjaz,vihjaz,fvh,fvih)
    finally:
        # runs on success and on failure; an exception still propagates
        wikipedia.stopme()

* perev_sta.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys, os
import codecs
import perevod
from bib_tekst import *
from bib_kateg import *

def vivod(b):
    """Show *b* on the console and append it to the report file.

    The report is the module-level ``otch`` stream; it is flushed after
    every message.
    """
    wikipedia.output(b)
    report = otch
    report.write(b)
    report.flush()

def prov_dobav_katcom1(nscom,zapriz,katcomsp,n):
    """Add ``[[Category:<n>]]`` to *katcomsp* if *nscom* knows that category.

    *n* is trimmed, its first letter upper-cased and anything from the '#'
    found by ``iskats`` onward dropped.  The link line is appended only when
    ``Category:<n>`` is a key of *nscom*, *n* is not a key of *zapriz* and
    the line is not already in the list.  Returns 1 when a line was
    appended, 0 otherwise.
    """
    n=perv_upper(ubr_nk_prob(n))
    vivod(u' prov_dobav_katcom1 %s\n'%n)

    hash_pos=iskats(n,0,u'#')
    if hash_pos!=-1:
        n=n[:hash_pos]
        vivod(u' prov_dobav_katcom1# %s\n'%n)

    full_name=u'Category:'+n
    if (not nscom.has_key(full_name)) or zapriz.has_key(n):
        return 0

    link_line=u'[['+full_name+u']]\n'
    if link_line in katcomsp:
        return 0

    katcomsp.append(link_line)
    vivod(u' <-- ++\n')
    return 1

def prov_dobav_katcom(nscom,zapriz,katcomsp,vhs,jaz):
    """Try *vhs* as a category name; for English also try plural forms.

    The name itself is always tried.  When *jaz* is u'en', a name ending in
    'y' is also tried with that letter replaced by 'ies'; any other name is
    tried with 's' and then with 'es' appended.
    """
    prov_dobav_katcom1(nscom,zapriz,katcomsp,vhs)
    if jaz!=u'en':
        return

    if vhs[-1]==u'y':
        plurals=[vhs[:-1]+u'ies']
    else:
        plurals=[vhs+u's',vhs+u'es']
    for candidate in plurals:
        prov_dobav_katcom1(nscom,zapriz,katcomsp,candidate)

def sozd_katcom(nscom,zapriz,nomio,kat):
    """Collect category link lines for an article, widening the search in stages.

    Stage 1 tries every (name, language) pair of *nomio*.  If that finds
    nothing, stage 2 tries the names of the categories in *kat*.  If still
    nothing, stage 3 tries every word of at least four characters from all
    names tried so far (parentheses count as spaces).  Returns the collected
    link lines concatenated into one string, which may be empty.
    """
    katcomsp=[]
    tried=[]

    for (name,jaz) in nomio:
        tried.append( (name,jaz) )
        prov_dobav_katcom(nscom,zapriz,katcomsp,name,jaz)

    if not katcomsp:
        vivod(u'- katcom nomio\n')
        for ka in kat:
            tried.append( (ka.n,ka.jaz) )
            prov_dobav_katcom(nscom,zapriz,katcomsp,ka.n,ka.jaz)

    if not katcomsp:
        vivod(u'- katcom\n')
        for (name,jaz) in tried:
            words=name.replace(u'(',u' ').replace(u')',u' ').split(u' ')
            for word in words:
                if len(word)>=4:
                    prov_dobav_katcom(nscom,zapriz,katcomsp,word,jaz)

    if not katcomsp:
        vivod(u'- katcom2\n')

    katcom=u''.join(katcomsp)

    vivod(u'========katcom=\n%s==========\n' % katcom)
    return katcom

def prov_te_format_kart(t):
    """Classify one image-link parameter as a formatting option.

    Returns 1 for the layout keywords (thumb, thumbnail, frame, framed,
    left, center, right) and for a size ``<digits>px`` of at least 50;
    returns 2 for a size below 50; returns 0 for anything else.  Only the
    first six characters are read as digits.
    """
    min_px=50
    t=ubr_nk_prob(t)
    if t in (u'thumb',u'thumbnail',u'frame',u'framed',
             u'left',u'center',u'right'):
        return 1
    if len(t)<3:
        return 0

    # read the leading run of digits, at most six of them
    width=0
    ndigits=0
    for ch in t[:6]:
        if not ch.isdigit():
            break
        width=width*10+ord(ch)-ord(u'0')
        ndigits+=1

    if ndigits<1 or t[ndigits:]!=u'px':
        return 0
    if width<min_px:
        return 2
    return 1

class Pereved_sta:
    """Record describing one translated article.

    The constructor stores *vhpn*, *t*, *statn* and *stat* under the same
    names, stores *n* as both ``n`` and ``n2``, and starts ``nkat``,
    ``prioritet`` and ``npredl`` at 0.
    """
    def __init__(self, vhpn,n,t,statn,stat):
        self.vhpn, self.t = vhpn, t
        self.n = self.n2 = n
        self.statn, self.stat = statn, stat
        self.nkat = self.prioritet = self.npredl = 0

    def vz_prioritet(self):
        """Return the ``prioritet`` attribute (usable as a sort key)."""
        return self.prioritet

def sum_per_stat(s,s1):
    """Accumulate the statistics object *s1* into *s* (modifies *s*).

    ``flp`` is set to 1 on *s* when it is truthy on *s1*; ``nup``, ``nnp``,
    ``nvap``, ``snp`` and ``svap`` are combined with ``+=``.
    """
    if s1.flp:
        s.flp = 1
    s.nup += s1.nup
    s.nnp += s1.nnp
    s.nvap += s1.nvap
    # NOTE(review): snp/svap are presumably collections, in which case +=
    # extends the object held by s in place - confirm against Perev_stat.
    s.snp += s1.snp
    s.svap += s1.svap


def sozd(slov,nscom,zapriz,sp_obr_st,vhjaz,vihjaz,n,t,snka,sniz,snizkat,
                              schss,tpref,
                              fl_d_iwi_n,fl_b_tekst):
    """Build a translated draft of article *n* (wikitext *t*), *vhjaz* -> *vihjaz*.

    Returns a ``Pereved_sta`` describing the draft, or None when the article
    is skipped: it is not a key of *sp_obr_st* while *fl_d_iwi_n* is set, its
    *slov* entry has a truthy ``riwi``, or its text already carries an
    interwiki link to the target site.

    Arguments modified as a side effect:
      snka    -- list; gets the source categories when none was translated
      sniz    -- list; gets links to source images with no known counterpart
      snizkat -- list kept parallel to *sniz* (category hints from sozd_katcom)
      schss   -- dict; counts how often each translated link target was used

    When *fl_b_tekst* is true the body text is not translated; the result
    then holds only the categories and interwiki links.  Otherwise a
    template line is put in front of the text and, if *tpref* is non-empty,
    ``n2`` becomes *tpref* + translated title.

    Reads the module-level ``nssvoj`` and ``otch`` and logs through ``vivod``.
    """

    vivod(u'\n\n\n%s\n'%n)

    vhpn=vhjaz+u':'+n
    # optional filter: only handle articles listed in the table
    if fl_d_iwi_n and not sp_obr_st.has_key(vhpn):
        vivod(u'<- net v tbl\n')
        return
    if slov.has_key((vhjaz,n)) and slov[(vhjaz,n)].riwi:
        vivod(u'<- uzxe v real ivi\n')
        return

    vhsi=wikipedia.getSite(code = vhjaz,fam = u'wikipedia')
    vihsi=wikipedia.getSite(code = vihjaz,fam = u'wikipedia')
    vhprefi=vhsi.family.image_namespace(vhjaz)
    vihprefi=vihsi.family.image_namespace(vihjaz)

    # work on the supplied text instead of fetching the page
    pl=wikipedia.Page(vhsi,n)
    pl._contents=t

    # names of this article in every language, used later for category hints
    nomio=[(n,vhjaz)]

    oiwi = pl.interwiki()
    niwi = {}
    for pl2 in oiwi:
        # an existing link to the target wiki means nothing is left to do
        if pl2.site()==vihsi:
            vivod(u'uzxe vihjaz %s %s\n'%(n,pl2.title()))
            return
        if pl2.site()!=vhsi:
            niwi[pl2.site()] = pl2
            nomio.append( (pl2.title(),pl2.site().lang) )
    niwi[vhsi]=pl

    plkat = pl.categories()
    kato=[]
    for pka in plkat:
        kato.append(Kateg(vhjaz,pka.title(),1))

    t = wikipedia.removeLanguageLinks(t)
    t = wikipedia.removeCategoryLinks(t,vhsi)

    # title: prefer a result from perevod_iwi_spis, else translate the title
    # text with perevod_stat
    rezn_t=perevod.perevod_iwi_spis(slov,vhjaz,vihjaz,n)
    if len(rezn_t)>0:
        fl_iwi_n=1
        rezn=rezn_t[0]
        statn=perevod.Perev_stat()
    else:
        fl_iwi_n=0
        (rezn,statn)=perevod.perevod_stat(slov,vhjaz,vihjaz,n)

    vihpl=wikipedia.Page(vihsi,rezn)

    # collapse whitespace: CR/TAB become spaces, runs of spaces become one,
    # a space before a newline is removed
    t = t.replace(u'\r',u' ').replace(u'\t',u' ')
    while u'  ' in t:
        t = t.replace(u'  ',u' ')
    t = t.replace(u' \n',u'\n')

    # tokens at which the scan of the text stops
    tks=[u'. ',u'.\n',u'\n\n',u'\n ',u'\n:',u'\n*',u'[[',u'{{',u'\n=',u'=\n',
           u'<gallery>',u'<math>',u']]',]

    # opening token -> matching closing token
    tkzs={u'[[':u']]',u'{{':u'}}',u'<gallery>':u'</gallery>',
               u'<math>':u'</math>'}

    st=[]       # finished (text, statistics) pieces, in order
    rt=u''      # source text of the chunk being collected
    sst=[]      # translated link targets found in the current chunk
    uktbl=[]    # start/end offsets in rt of each link's text, two per link
    npredl=0    # number of translated chunks of at least 20 characters
    snizdl0=len(sniz)

    p=0
    while (not fl_b_tekst) and p<len(t):
        (p1,i)=iskats_mn(t,p,tks)

#        vivod(u'nach predl %d %d\n'%(p1,i))

        if p1==-1:
            p1=len(t)
#        elif tks[i][0]==u'.' or tks[i][0]==u'=':
#        else:
        elif not tkzs.has_key(tks[i]):
            # a non-bracket token: keep its first character in the chunk
            p1+=1

        rt+=t[p:p1].replace(u'[[',u' ').replace(u']]',u' ')

        kon_predl=1     # cleared when a plain link continues the chunk
        ss_vih=u''      # target-wiki image name, when one was resolved

        if tks[i]==u'[[':
            os=u'[['
            zs=u']]'
            p2=iskkonpodp(t,p1+len(os),os,zs)
            if p2==-1:
                p2=len(t)
            (pr,ir)=iskats_mn(t,p1+len(os),[u'|',os,zs])
            pnt=p1+len(os)
            pks=p2
            # a '|' found first separates the link target from its label
            if pr!=-1 and ir==0:
                pnt=pr+1
                pks=pr
            ss=ubr_nk_prob(t[p1+len(os):pks])
            ss_pdt=iskats(ss,1,u':')

            if ss_pdt==-1:
                # target without a ':' - an ordinary article link
                kon_predl=0

                perss=perevod.perevod_iwi_spis(slov,vhjaz,vihjaz,ss)
                p3=p2+len(zs)
                p4=p3
                if len(perss)>=1:
                    # letters right after ']]' belong to the link text
                    while p4<len(t) and t[p4].isalpha():
                        p4+=1

                    uktbl.append(len(rt))
                    rt+=( (t[pnt:p2]+t[p3:p4]).
                                 replace(u'[[',u' ').
                                 replace(u']]',u' ')
                          )
                    uktbl.append(len(rt))
                    sst.append(perss[0])
                    if schss.has_key(perss[0]):
                        schss[perss[0]]+=1
                    else:
                        schss[perss[0]]=1

                else:
                    rt+=t[pnt:p2].replace(u'[[',u' ').replace(u']]',u' ')
                p=p4
            elif RBR(vhprefi,ss[:ss_pdt]) or RBR(u'Image',ss[:ss_pdt]):
                # image link: find the name to use on the target wiki
                ss=ss[ss_pdt+1:]
                ss=perv_upper(ubr_nk_prob(ss.replace(u'_',u' ')))
                if nscom.has_key(u'Image:'+ss):
                    ss_vih=vihprefi+u':'+ss
                else:
                    ss_vh=vhprefi+u':'+ss
                    perss=perevod.perevod_iwi_spis(slov,vhjaz,vihjaz,ss_vh)
                    if len(perss)>=1:
                        ss_vih=perss[0]
                    else:
                        # no counterpart known: record the image once
                        b=u'[['+vhjaz+u':'+ss_vh+u']]'
                        if not b in sniz:
                            sniz.append(b)
                            snizkat.append(u'')

        if kon_predl:
#            st.append(rt)
            while u'  ' in rt:
                rt = rt.replace(u'  ',u' ')
            if len(rt)>=20:
                npredl+=1
            (r1,s1)=perevod.perevod_stat(slov,vhjaz,vihjaz,rt,uktbl=uktbl)
            # re-insert the links from last to first so that earlier offsets
            # stay valid while r1 grows
            # NOTE(review): assumes perevod_stat rewrites uktbl to offsets in
            # the translated text - confirm.
            qi=len(sst)-1
            while qi>=0:
                ss=sst[qi]
                q1=uktbl[qi*2]
                q2=uktbl[qi*2+1]
                if perv_upper(ss)==perv_upper(r1[q1:q2]):
                    r1=r1[:q1]+u'[['+r1[q1:q2]+u']]'+r1[q2:]
                else:
                    r1=r1[:q1]+u'[['+ss+u'|'+r1[q1:q2]+u']]'+r1[q2:]
                qi-=1

            st.append((r1,s1))
            rt=u''
            sst=[]
            uktbl=[]

            if tkzs.has_key(tks[i]):
                # bracketed construct: continue after its closing token
                os=tks[i]
                zs=tkzs[tks[i]]
                p2=iskkonpodp(t,p1+len(os),os,zs)
                if p2==-1:
                    p=len(t)
                else:
                    p=p2+len(zs)
                if tks[i]==u'<math>':
                    # formulas are copied unchanged
                    st.append((t[p1:p],perevod.Perev_stat()))
                elif ss_vih!=u'':
                    # keep the image name and its formatting parameters
                    # verbatim; translation resumes at the first other one
                    pa=p1+len(os)
                    while 1:
                        (pr,ir)=iskats_mn(t,pa,[u'|',zs])
                        if pr==-1:
                            pr=len(t)
                        if pa==p1+len(os) or prov_te_format_kart(t[pa:pr]):
                            pa=pr
                            if pa<len(t) and t[pa]==u'|':
                                pa+=1
                            continue
                        break
                    st.append((u'[['+ss_vih+u'|'+t[pnt:pa],
                                              perevod.Perev_stat()))
                    p=pa
            else:
                p=p1


    # join the pieces and add up their statistics
    rez=u''
    stat=perevod.Perev_stat()
    for (r1,s1) in st:
        rez+=r1
        sum_per_stat(stat,s1)

    while u'  ' in rez:
        rez = rez.replace(u'  ',u' ')
    while u'\n\n\n' in rez:
        rez = rez.replace(u'\n\n\n',u'\n\n')
    rez=ubr_nk_prob(rez)

    nt_kat_sp=perev_kateg(slov,nssvoj,vihsi,kato,0,otch)

    if len(nt_kat_sp)<1:
        # no category came back: remember the source categories
        for ka in kato:
            jaz=ka.jaz
            issite=wikipedia.getSite(jaz,fam = pl.site().family)
            iskatprefi=issite.family.category_namespace(jaz)
            b=jaz+u':'+iskatprefi+u':'+ka.n
            if not b in snka:
                snka.append(b)

    nt_kat=kateg_v_tekst(nt_kat_sp)

    rez+=u'\n\n'+wikipedia.replaceLanguageLinks(nt_kat, niwi, site=vihsi)

    ps=Pereved_sta(vhpn,rezn,rez,statn,stat)
    ps.nkat=len(nt_kat_sp)

    # distinct entries of snp from the title and body statistics
    osnp={}
    for a in statn.snp:
        osnp[a]=1
    for a in stat.snp:
#        vivod(u'%s\n'%a)
#        if not osnp.has_key(a):
        osnp[a]=1


    if snizdl0<len(sniz):
        # images were added by this article: attach the category hints
        katcom=sozd_katcom(nscom,zapriz,nomio,kato)
        katcom=katcom.replace(u'\n',u' ')
        i=snizdl0
        while i<len(sniz):
            snizkat[i]=katcom
            i+=1


    ps.npredl=npredl
    # base score; the caller sorts ascending, so lower values come first
    prioritet = (len(osnp)+statn.nvap)*10000+stat.nvap*10

    if not fl_b_tekst:
        if npredl<3:
            prioritet+=100000000
        else:
            prioritet/=npredl
        if tpref!=u'':
            ps.t=u'{{polurinda movu|%s}}\n'%rezn+ps.t
            ps.n2=tpref+ps.n
        else:
            ps.t=u'{{polurinda}}\n'+ps.t

    if len(nt_kat_sp)<1:
        prioritet+=10000000

    if fl_d_iwi_n and not fl_iwi_n:
        prioritet+=200000000

    ps.prioritet=prioritet

    vivod(u'%d  %d  %s\n\n'%(ps.prioritet,ps.npredl,n))

    return ps


def main(slov,nscom,zapriz,sp_obr_st,vhjaz,vihjaz,
                            filename,firez,finesl,finka,finiz,fiuzxe,fipref,
                            fl_d_iwi_n,fl_b_tekst,fl_schss):
    """Translate every article in *filename* and write drafts plus reports.

    Input: a title line, the article text lines, then a line starting with
    '======='.  Each complete article goes through ``sozd``.

    Output files (all created or overwritten):
      firez  -- the drafts (title, text, '========'), in ascending priority
      fiuzxe -- the source page name of every draft written
      finesl -- per-priority-group and overall reports produced by
                perevod.perev_pech_nezsl, followed by prefix-based guesses
      finka  -- the entries that sozd collected in its snka list
      finiz  -- the entries of sozd's sniz list with their category hints

    *fipref* is u'.' for "no prefix"; otherwise the first line of that file
    is used as the title prefix.  *fl_schss* == 1 replaces the priority by
    the negated link count from schss (1000 if absent); == 2 replaces it by
    the negated number of categories.
    """

    t=u''
    n=u''
    f=codecs.open(filename,'rb',encoding='utf-8')
    vivod(u'fipref=%s\n'%fipref)
    tpref=u''
    if fipref!=u'.':
        fpref=codecs.open(fipref, 'rb', 'utf-8')
        # NOTE(review): raises IndexError if the prefix file is empty
        s=fpref.readlines()[0]
        fpref.close()
        if len(s)>0 and ord(s[0]) == 65279:
            s=s[1:]
        tpref=ubr_nk_prob(s)
    vivod(u'tpref=%s\n'%tpref)

    frez=codecs.open(firez, 'w', 'utf-8')
    fnesl=codecs.open(finesl, 'w', 'utf-8')
    fnka=codecs.open(finka, 'w', 'utf-8')
    fniz=codecs.open(finiz, 'w', 'utf-8')
    fuzxe=codecs.open(fiuzxe, 'w', 'utf-8')
    tps=[]          # Pereved_sta results returned by sozd
    snka=[]
    sniz=[]
    snizkat=[]
    schss={}
    # i==0: the next line is a title; i==1: collecting article text
    i=0
    for s in f.readlines():
#        wikipedia.output(u'%d\n' % ord(s[0]))
        # drop a leading byte-order mark (U+FEFF)
        if ord(s[0]) == 65279:
            s=s[1:]
#        wikipedia.output(u'%d   %s   %s\n' % (i,n,s))
        if s[0:7] == u'=======':
            if i==1:
                rez=sozd(slov,nscom,zapriz,sp_obr_st,vhjaz,vihjaz,
                                   n,t,snka,sniz,snizkat,
                                   schss,tpref,fl_d_iwi_n,fl_b_tekst)
                # sozd returns None for skipped articles
                if rez!=None:
                    tps.append(rez)
            i=0
            t=u''
        elif i==0:
            n=s
            while len(n)>0 and n[len(n)-1]==u'\n':
                n=n[:len(n)-1]
            i=1
        else:
            t=t+s

    for t in snka:
        fnka.write(u'%s\n'%t)
        fnka.flush()
    for i in range(len(sniz)):
        fniz.write(u'%s %s\n'%(sniz[i],snizkat[i]))
        fniz.flush()

    # optional alternative orderings
    if fl_schss==1:
        for tp in tps:
            if schss.has_key(tp.n):
                tp.prioritet=-schss[tp.n]
            else:
                tp.prioritet=1000
    elif fl_schss==2:
        for tp in tps:
            tp.prioritet=-tp.nkat

#    tps.sort(key=Pereved_sta.vz_prioritet,reverse=True)
    tps.sort(key=Pereved_sta.vz_prioritet)

    osnpo={}        # accumulated over all drafts written so far
    osvapo={}
    kolotch=50
    i=0
    # walk the sorted drafts in groups of equal priority
    while i<len(tps):

        pr0=tps[i].prioritet
        osnp={}
        osvap={}
        osnp1={}    # entries of this group not seen in earlier drafts
        osvap1={}
        j=i
        while j<len(tps) and pr0==tps[j].prioritet:
            frez.write(u'%s\n%s\n========\n' % (tps[j].n2,tps[j].t))
            frez.flush()
            fuzxe.write(u'%s\n' % tps[j].vhpn)
            fuzxe.flush()

            perevod.perev_uch_nezsl(osnp,osvap,tps[j].statn)
            perevod.perev_uch_nezsl(osnp,osvap,tps[j].stat)

            # compare against the overall tables before this draft is
            # added to them below
            for sl in osnp.iterkeys():
                if not osnpo.has_key(sl):
                    osnp1[sl]=osnp[sl]

            for sl in osvap.iterkeys():
                if not osvapo.has_key(sl):
                    osvap1[sl]=osvap[sl]

            perevod.perev_uch_nezsl(osnpo,osvapo,tps[j].statn)
            perevod.perev_uch_nezsl(osnpo,osvapo,tps[j].stat)

            j+=1

        fnesl.write(u'prior=%d  n=%d\n========\n' % (pr0,j-i))
        fnesl.flush()
        perevod.perev_pech_nezsl(slov,vhjaz,vihjaz,osnp1,osvap1,fnesl,kolotch)
        fnesl.write(u'=========================\n')
        fnesl.flush()
        i=j

    fnesl.write(u'=========================*=*=*=*=\n')
    fnesl.flush()
    kolotch2=1000
    perevod.perev_pech_nezsl(slov,vhjaz,vihjaz,osnpo,osvapo,fnesl,kolotch2)

    fnesl.write(u'\n\n')
    fnesl.flush()

    # order the osnpo entries by their stored value, highest first
    ssnp=[]
    for t, n in osnpo.iteritems():
        ssnp.append((n,t))

    ssnp.sort(reverse=True)
    i=0
    for n, t in ssnp:
        # For entries starting with a negating prefix (Un-, In-, Ill-, Imb-,
        # Imm-, Imp-, Irr-), drop the first two letters, translate the rest
        # and put 'Ne' in front of each alternative.
        if len(t)>=4 and t.startswith(u'Un'):
            t1=t[2:]
#            prist=u'Mal'
            prist=u'Ne'
        elif len(t)>=4 and (t.startswith(u'Ill') or
                          t.startswith(u'Imb') or
                          t.startswith(u'Imm') or
                          t.startswith(u'Imp') or
                          t.startswith(u'Irr') or
                          t.startswith(u'In') ):
            t1=t[2:]
            prist=u'Ne'
        else:
            continue

        (t2,stat2)=perevod.perevod_stat(slov,vhjaz,vihjaz,t1)
        # strip surrounding parentheses from the translation
        if len(t2)>=6 and t2[0]==u'(' and t2[len(t2)-1]==u')':
            t2=t2[1:len(t2)-1]
        t2t=t2.split(u', ')
        t2r=u''
        i=0
        # rebuild the ', '-separated list with the prefix on every item
        while i<len(t2t):
            t2r+=prist+t2t[i]
            i+=1
            if i>=len(t2t):
                break
            t2r+=u', '

        fnesl.write(u'%s : %s\n'%(t,t2r))
        fnesl.flush()



if __name__ == "__main__":
    # Arguments: <vhjaz> <vihjaz> <dictionary file> <input file>
    #            <prefix file, or '.'> <output base name> [flag letters]
    try:
        vhjaz = wikipedia.argHandler(sys.argv[1], 'perevod')
        vihjaz = wikipedia.argHandler(sys.argv[2], 'perevod')
        fslov = wikipedia.argHandler(sys.argv[3], 'perevod')
        fvh = wikipedia.argHandler(sys.argv[4], 'perevod')
        fipref = wikipedia.argHandler(sys.argv[5], 'perevod')
        fi = wikipedia.argHandler(sys.argv[6], 'perevod')

        # every output file shares the base name and differs by suffix
        fvih= fi+u'.txt'
        fotch = fi+u'-ot.txt'
        fipoln = fi+u'-pp.txt'
        finesl = fi+u'-ns.txt'
        finka = fi+u'-nk.txt'
        finiz = fi+u'-iz.txt'
        fiuzxe = fi+u'-uz.txt'

        # optional seventh argument: a string of one-letter flags
        have_flags = len(sys.argv)>=8

        fl_d_iwi_n=0
        if have_flags and (u'w' in sys.argv[7]):
            fl_d_iwi_n=1

        fl_b_tekst=0
        if have_flags and (u'k' in sys.argv[7]):
            fl_b_tekst=1

        # 's' takes precedence over 'c'
        fl_schss=0
        if have_flags and (u's' in sys.argv[7]):
            fl_schss=1
        elif have_flags and (u'c' in sys.argv[7]):
            fl_schss=2

        otch = codecs.open(fotch, 'w', 'utf-8')
        slov={}
        nssvoj={}
        slovdop={}
        perevod.zagruzslov(slov,slovdop,nssvoj,vhjaz,vihjaz,fslov)

        # optional tables from the loaded dictionary; empty when absent
        nscom=slovdop.get(u'c',{})
        zapriz=slovdop.get(u'z',{})
        sp_obr_st=slovdop.get(u'spst',{})
        sp_obr_uzxe=slovdop.get(u'pe_uzxe',{})

        # remove the entries listed under 'pe_uzxe' from the work table
        for t in sp_obr_uzxe:
            if sp_obr_st.has_key(t):
                del sp_obr_st[t]

        main(slov,nscom,zapriz,sp_obr_st,vhjaz,vihjaz,
                            fvh,fvih,finesl,finka,finiz,fiuzxe,fipref,
                            fl_d_iwi_n,fl_b_tekst,fl_schss)
    finally:
        # runs on success and on failure; an exception still propagates
        wikipedia.stopme()

* perev_tit.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
import perevod

def iskat(t,s):
    """Return the index of the first occurrence of *s* in *t*, or -1.

    An empty *s* is found at index 0.
    """
    width=len(s)
    # range() is empty when s is longer than t
    for start in range(len(t)-width+1):
        if t[start:start+width]==s:
            return start
    return -1

def iskats_mn(t,p0,ss):
    """Find the first position at or after *p0* where any token of *ss* starts.

    Returns ``(position, index_of_token_in_ss)``.  When several tokens match
    at the same position the one listed first in *ss* wins.  Returns
    ``(-1, 0)`` when nothing is found.

    The original compared ``t[i].startswith(token)``, i.e. one character
    against the token, so a token longer than one character could never
    match.  The comparison is now made against the text starting at *i*.
    """
    for i in range(p0,len(t)):
        for j in range(len(ss)):
            if t.startswith(ss[j],i):
                return (i,j)
    return (-1,0)

def perv_upper(t):
    """Return *t* with its first character upper-cased; u'' for empty input."""
    if len(t)==0:
        return u''
    head=t[0]
    tail=t[1:]
    return head.upper()+tail

def ubr_nk_prob(t):
    """Return *t* without leading and trailing blanks.

    Only space, newline, tab and carriage return count as blanks.
    """
    blanks=(u' ',u'\n',u'\t',u'\r')
    start=0
    stop=len(t)
    while start<stop and t[start] in blanks:
        start+=1
    # never move the right edge past the first kept character
    while stop-1>start and t[stop-1] in blanks:
        stop-=1
    return t[start:stop]


def main(slov,nssvoj,vhjaz,vihjaz,fvh,fvih,fipoln,finesl,fispit,flvsjo):
    """Translate every title in *fvh* and sort the results into report files.

    Each line has CRs removed, underscores turned into spaces and is
    trimmed, then goes through ``perevod.perevod_stat``.

      fvih   -- ``title|translation`` for titles whose statistics show
                ``nnp == 0`` and ``nvap == 0`` (for every title when
                *flvsjo* is set)
      fispit -- of those, the titles whose translation is not a key of
                *nssvoj*; the others are noted in the ``otch`` report
      fipoln -- the translation of every title
      finesl -- the final report written by ``perevod.perev_pech_nezsl``
    """
    otch.write(u'len(nssvoj)=%d\n'%len(nssvoj))
    otch.flush()

    kolotch=1500
    src=codecs.open(fvh,'rb',encoding='utf-8')
    out_pairs=codecs.open(fvih, 'w', 'utf-8')
    out_all=codecs.open(fipoln, 'w', 'utf-8')
    out_report=codecs.open(finesl, 'w', 'utf-8')
    out_new=codecs.open(fispit, 'w', 'utf-8')
    osnp={}
    osvap={}
    for line in src.readlines():
        # drop a leading byte-order mark (U+FEFF)
        if ord(line[0]) == 65279:
            line=line[1:]
        title=ubr_nk_prob(line.replace(u'\r',u'').replace(u'_',u' '))

        (rez,stat)=perevod.perevod_stat(slov,vhjaz,vihjaz,title)

        if flvsjo or (stat.nnp==0 and stat.nvap==0):
            out_pairs.write(title+u'|'+rez+u'\n')
            out_pairs.flush()
            if not nssvoj.has_key(rez):
                out_new.write(title+u'\n')
                out_new.flush()
            else:
                otch.write(rez+u'\n')
                otch.write(u' <- uzxe\n')
                otch.flush()

        out_all.write(rez+u'\n')
        out_all.flush()

        perevod.perev_uch_nezsl(osnp,osvap,stat)

    perevod.perev_pech_nezsl(slov,vhjaz,vihjaz,osnp,osvap,out_report,kolotch)


if __name__ == "__main__":
    # Arguments: <vhjaz> <vihjaz> <dictionary file> <input file>
    #            <output base name> ['2']
    try:
        vhjaz = wikipedia.argHandler(sys.argv[1], 'perevod')
        vihjaz = wikipedia.argHandler(sys.argv[2], 'perevod')
        fslov = wikipedia.argHandler(sys.argv[3], 'perevod')
        fvh = wikipedia.argHandler(sys.argv[4], 'perevod')
        fi = wikipedia.argHandler(sys.argv[5], 'perevod')

        # every output file shares the base name and differs by suffix
        fvih= fi+u'.txt'
        fotch = fi+u'-ot.txt'
        fipoln = fi+u'-pp.txt'
        finesl = fi+u'-ns.txt'
        fispit = fi+u'-st.txt'

        # a sixth argument of '2' switches flvsjo on
        flvsjo=int(len(sys.argv)>=7 and sys.argv[6]==u'2')

        otch = codecs.open(fotch, 'w', 'utf-8')
        slov={}
        nssvoj={}
        slovdop={}
        perevod.zagruzslov(slov,slovdop,nssvoj,vhjaz,vihjaz,fslov)
        main(slov,nssvoj,vhjaz,vihjaz,fvh,fvih,fipoln,finesl,fispit,flvsjo)
    finally:
        wikipedia.stopme()

* vich_slivi.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
from bib_tekst import *

def vivod(b):
    """Show *b* on the console and append it to the report file.

    The report is the module-level ``otch`` stream, opened by ``main``; it
    is flushed after every message.
    """
    wikipedia.output(b)
    report = otch
    report.write(b)
    report.flush()


def main():
    """Copy the lines of ``filename`` to ``firez``, dropping excluded ones.

    The exclusion set is every non-empty trimmed line of ``fi1`` that does
    not start with ``<vhjaz>:``.  Dropped lines are reported through
    ``vivod``.  File names and ``vhjaz`` are module-level globals; the
    global ``otch`` report stream is opened here.
    """
    global otch
    src=codecs.open(filename,'rb',encoding='utf-8')
    excl_src=codecs.open(fi1,'rb',encoding='utf-8')
    otch = codecs.open(fotch, 'w', 'utf-8')
    frez = codecs.open(firez, 'w', 'utf-8')

    own_prefix=vhjaz+u':'
    excluded={}
    for line in excl_src.readlines():
        # drop a leading byte-order mark (U+FEFF)
        if ord(line[0]) == 65279:
            line=line[1:]
        line=ubr_nk_prob(line)
        if len(line)>0 and not line.startswith(own_prefix):
            excluded[line]=1

    for line in src.readlines():
        if ord(line[0]) == 65279:
            line=line[1:]
        line=ubr_nk_prob(line)
        if excluded.has_key(line):
            vivod(u'- %s\n'%line)
            continue
        frez.write(line+u'\n')
        frez.flush()

#fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
#filename = fi+'.txt'
#fotch2 = fi+'_op.txt'

# Arguments: <vhjaz> <exclusion list> <input list> <output base name>
vhjaz = wikipedia.argHandler(sys.argv[1], 'cht_cat')
fi1 = wikipedia.argHandler(sys.argv[2], 'cht_cat')
filename = wikipedia.argHandler(sys.argv[3], 'cht_cat')
fi = wikipedia.argHandler(sys.argv[4], 'cht_cat')
firez = fi+'.txt'
fotch = fi+'_ot.txt'

mysite = wikipedia.getSite()

# wikipedia.stopme() is called whether main() succeeds or raises; an
# exception still propagates afterwards.
try:
    main()
finally:
    wikipedia.stopme()

* zamen_slivi.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
from bib_tekst import *

def vivod(b):
    """Show *b* on the console and append it to the report file.

    The report is the module-level ``otch`` stream, opened by ``main``; it
    is flushed after every message.
    """
    wikipedia.output(b)
    report = otch
    report.write(b)
    report.flush()


def main():
    """Rewrite the names in ``filename`` using the ``old|new`` table in ``fi1``.

    Table lines that do not split into exactly two parts on '|' are
    ignored; both parts are trimmed and prefixed with ``<vhjaz>:``.  Each
    input name is replaced repeatedly while it is a key of the table, with
    at most ten steps before a '!cikl' warning is logged and the chain is
    cut.  The resulting names go to ``firez``.  Finally every table entry is
    logged together with the number of times it was applied.
    """
    global otch
    src=codecs.open(filename,'rb',encoding='utf-8')
    table_src=codecs.open(fi1,'rb',encoding='utf-8')
    otch = codecs.open(fotch, 'w', 'utf-8')
    frez = codecs.open(firez, 'w', 'utf-8')

    ns={}
    for line in table_src.readlines():
        # drop a leading byte-order mark (U+FEFF)
        if ord(line[0]) == 65279:
            line=line[1:]
        parts=ubr_nk_prob(line).split(u'|')
        if len(parts)!=2:
            continue
        ns[vhjaz+u':'+ubr_nk_prob(parts[0])]=vhjaz+u':'+ubr_nk_prob(parts[1])

    used={}
    for line in src.readlines():
        if ord(line[0]) == 65279:
            line=line[1:]
        name=ubr_nk_prob(line)
        steps=0
        while ns.has_key(name):
            used[name]=used.get(name,0)+1
            name=ns[name]
            steps+=1
            if steps>=10:
                vivod(u'!cikl %s\n'%name)
                break

        frez.write(name+u'\n')
        frez.flush()

    for old,new in ns.iteritems():
        vivod(u'%d %s | %s\n'%(used.get(old,0),old,new))


#fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
#filename = fi+'.txt'
#fotch2 = fi+'_op.txt'

# Arguments: <vhjaz> <input list> <replacement table> <output base name>
vhjaz = wikipedia.argHandler(sys.argv[1], 'cht_cat')
filename = wikipedia.argHandler(sys.argv[2], 'cht_cat')
fi1 = wikipedia.argHandler(sys.argv[3], 'cht_cat')
fi = wikipedia.argHandler(sys.argv[4], 'cht_cat')
firez = fi+'.txt'
fotch = fi+'_ot.txt'

mysite = wikipedia.getSite()

# wikipedia.stopme() is called whether main() succeeds or raises; an
# exception still propagates afterwards.
try:
    main()
finally:
    wikipedia.stopme()

* podg_mma_f0_v2.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
import xmlreader
from bib_tekst import *


def ubrkoment (text):
    """Return *text* without ``<nowiki>...</nowiki>`` sections and HTML comments.

    Matching is case-insensitive and spans line breaks.  Removal is repeated
    until no match is left.

    The pattern previously read ``<nowiki>.*?`` with no closing tag; the
    non-greedy ``.*?`` then matched nothing, so only the opening tag was
    removed and the nowiki content stayed in the text.
    """
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text

def korr_im (jaz,t,pref):
    """Normalise the page name *t*; return u'' for an unwanted namespace.

    The first entry of *pref* that *t* starts with is cut off, then the
    first letter is upper-cased.  A name containing ':' is kept only when it
    starts with ``Category:``, ``Image:`` or the category/image namespace
    name that ``mysite.family`` gives for *jaz*.
    """
    for p in pref:
        if t.startswith(p):
            t=t[len(p):]
            break

    t=perv_upper(t)

    if u':' not in t:
        return t
    if t.startswith(u'Category:') or t.startswith(u'Image:'):
        return t
    # the localized prefixes are looked up only when the English ones fail
    if t.startswith(mysite.family.category_namespace(jaz)+u':'):
        return t
    if t.startswith(mysite.family.image_namespace(jaz)+u':'):
        return t
    return u''

# Category-link prefixes that main() searches for in the page text.  main()
# rewrites '[[k' to '[[K' before searching, which is presumably why the
# lower-case 'kategorio' variants are left commented out.
iskkat=[
    u'[[Kategorio:Naskiĝ',
#    u'[[kategorio:Naskiĝ',
    u'[[Kategorio:naskiĝ',
#    u'[[kategorio:naskiĝ',
    u'[[Kategorio:Mort',
#    u'[[kategorio:Mort',
    u'[[Kategorio:mort',
#    u'[[kategorio:mort',
       ]

def main(vhjaz,fvh,fvih,fipref):
    """Write to *fvih* the titles of dump pages that carry a listed category.

    *fvh* is an XML dump read through ``xmlreader.XmlDump``.  A page is
    selected when it is not a redirect, its title has no ':' and its text
    contains one of the prefixes in the module-level ``iskkat`` list.  One
    title per line is written.

    *fipref*, when not empty, names a file whose lines are read into a local
    list; that list is not used further in this function.
    """
    # NOTE(review): tzfl, tzst, tzno, n, insite and tblredir are assigned
    # but never read in this function.
    tzfl=0
    tzst={}
    tzno={}


    pref=[]
    if fipref!=u'':
        fpref=codecs.open(fipref,'rb',encoding='utf-8')
        for s in fpref.readlines():
            # drop a leading byte-order mark (U+FEFF)
            if ord(s[0]) == 65279:
                s=s[1:]
            s=s.replace(u'\r',u'')
            if s[len(s)-1]==u'\n':
                s=s[:len(s)-1]
            pref.append(s)
        fpref.close()


    n=u''
#    f0=codecs.open(fvh,'rb',encoding='utf-8')
    f1=codecs.open(fvih, 'w', 'utf-8')

    insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')

    tblredir = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(fvh)
    redirR = wikipedia.getSite().redirectRegex()
    readPagesCount = 0

    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 1000 pages
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount
        m = redirR.search(entry.text)
        if (not m) and (not u':' in entry.title):
            # normalise spacing and the case of a leading 'k' inside links
            # so that the prefixes in iskkat can match
            t=entry.text.replace(u'[[ ',u'[[').replace(u': ',u':')
            t=t.replace(u'[[k',u'[[K')
            dop=0
            for isk in iskkat:
                if isk in t:
                    dop=1
                    break
            if dop:
                b=u'%s\n' % entry.title
                f1.write(b)
                f1.flush()

# The source language is fixed; arguments: <xml dump> <output file>
vhjaz = u'eo'
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')

# no prefix file is used
fpref=u''

mysite = wikipedia.getSite()

# wikipedia.stopme() is called whether main() succeeds or raises; an
# exception still propagates afterwards.
try:
    main(vhjaz,fvh,fvih,fpref)
finally:
    wikipedia.stopme()

</nowiki>

* podg_mma_f1.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
import xmlreader
from bib_tekst import *


def ubrkoment (text):
    """Return *text* without ``<nowiki>...</nowiki>`` sections and HTML comments.

    Matching is case-insensitive and spans line breaks.  Removal is repeated
    until no match is left.

    The pattern previously read ``<nowiki>.*?`` with no closing tag; the
    non-greedy ``.*?`` then matched nothing, so only the opening tag was
    removed and the nowiki content stayed in the text.
    """
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text

def korr_im (jaz,t,pref):
    """Normalise the page name *t*; return u'' for an unwanted namespace.

    The first entry of *pref* that *t* starts with is cut off, then the
    first letter is upper-cased.  A name containing ':' is kept only when it
    starts with ``Category:``, ``Image:`` or the category/image namespace
    name that ``mysite.family`` gives for *jaz*.
    """
    for p in pref:
        if t.startswith(p):
            t=t[len(p):]
            break

    t=perv_upper(t)

    if u':' not in t:
        return t
    if t.startswith(u'Category:') or t.startswith(u'Image:'):
        return t
    # the localized prefixes are looked up only when the English ones fail
    if t.startswith(mysite.family.category_namespace(jaz)+u':'):
        return t
    if t.startswith(mysite.family.image_namespace(jaz)+u':'):
        return t
    return u''


def main(vhjaz,fvh,fvih,fipref):
    # Scan an XML dump for redirects whose title and target are equal when
    # letter case is ignored and write one "dop|title|target" line for each.
    #   vhjaz  - language code given to wikipedia.getSite()
    #   fvh    - dump location given to xmlreader.XmlDump()
    #   fvih   - output file, (re)created and written as UTF-8
    #   fipref - file with one prefix per line, or u'' for none
    # NOTE(review): tzfl, tzst, tzno, n, insite and tblredir are assigned but
    # not read anywhere in this function.
    tzfl=0
    tzst={}
    tzno={}


    # Load the prefix list: drop a leading BOM (U+FEFF = 65279), carriage
    # returns and the trailing newline of every line.
    # NOTE(review): pref is filled here but not used later in this function.
    pref=[]
    if fipref!=u'':
        fpref=codecs.open(fipref,'rb',encoding='utf-8')
        for s in fpref.readlines():
            if ord(s[0]) == 65279:
                s=s[1:]
            s=s.replace(u'\r',u'')
            # NOTE(review): raises IndexError if s is empty at this point.
            if s[len(s)-1]==u'\n':
                s=s[:len(s)-1]
            pref.append(s)
        fpref.close()


    n=u''
#    f0=codecs.open(fvh,'rb',encoding='utf-8')
    # NOTE(review): f1 is never closed explicitly; every record is flushed
    # right after it has been written.
    f1=codecs.open(fvih, 'w', 'utf-8')

    insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')

    tblredir = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(fvh)
    redirR = wikipedia.getSite().redirectRegex()
    readPagesCount = 0

    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 10000 pages
        if readPagesCount % 10000 == 0:
            print '%i pages read...' % readPagesCount
        # Only pages whose text matches the redirect pattern are examined.
        m = redirR.search(entry.text)
        if m:
            target = m.group(1)
            # There might be redirects to another wiki. Ignore these.
            for code in wikipedia.getSite().family.langs.keys():
                if target.startswith('%s:' % code) or target.startswith(':%s:' % code):
                    wikipedia.output(u'NOTE: Ignoring %s which is a redirect to %s:' % (entry.title, code))
                    target = None
                    break
            # if the redirect does not link to another wiki
            if target:
                target = target.replace('_', ' ')
                # remove leading and trailing whitespace
                target = target.strip()
                # capitalize the first letter
                # NOTE(review): target[0] raises IndexError if target is empty
                # after strip().
                if not wikipedia.getSite().nocapitalize:
                    target = target[0].upper() + target[1:]
                # Cut off a section anchor and a pipe label, if present.
                if '#' in target:
                    target = target[:target.index('#')]
                if '|' in target:
                    wikipedia.output(u'HINT: %s is a redirect with a pipelink.' % entry.title)  
                    target = target[:target.index('|')]
#                tblredir[entry.title] = target
#                b=u'%s|%s\n' % (entry.title, target)
                # Compare title and target word by word (split on single spaces).
                tt=entry.title
                tc=target
                tts=tt.split(u' ')
                tcs=tc.split(u' ')
                ltt=len(tts)
                ltc=len(tcs)
                # Report only pairs with the same word count, at least two
                # words, and identical text once both are lower-cased.
                if ltt==ltc and ltt>=2 and tt.lower()==tc.lower():
                    # dop stays 1 unless, in some differing word pair, the
                    # lengths differ or the target has a character that
                    # changes under lower() at a position where the title
                    # character does not.
                    dop=1
                    for i in range(ltt):
                        if tts[i]!=tcs[i]:
                            if len(tts[i])!=len(tcs[i]):
                                dop=0
                            # NOTE(review): this loop also runs when the
                            # lengths differ and indexes tcs[i] by positions
                            # of tts[i].
                            for j in range(len(tts[i])):
                                if (tts[i][j]==tts[i][j].lower() and
                                      tcs[i][j]!=tcs[i][j].lower()):
                                    dop=0
#                        for j in range(len(tcs[i])):
#                            if j>0 and tcs[i][j]!=tcs[i][j].lower():
#                                dop=0
                    b=u'%d|%s|%s\n' % (dop,tt, tc)
                    f1.write(b)
                    f1.flush()

# Script entry point: collect the arguments for main() and run it.
# The language code is fixed; the command-line variant is left disabled below.
vhjaz = u'eo'
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
# argv[1] is handed to main() as the dump location, argv[2] as the output file.
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')

# Optional prefix file; main() skips loading it when this stays u''.
# NOTE(review): the value is taken from argv[3] but only when sys.argv has at
# least five entries -- confirm that ">=4" was not intended.
fpref=u''
if len(sys.argv)>=5:
    fpref = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')


#fotch = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')

#mysite = wikipedia.getSite()
#otch = codecs.open(fotch, 'w', 'utf-8')
# Module-level site object; korr_im() reads it as a global.
mysite = wikipedia.getSite()

# wikipedia.stopme() is called on the success path and on the failure path;
# an exception raised by main() is re-raised after that call.
try:
    main(vhjaz,fvh,fvih,fpref)
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()

</nowiki>

* podg_mma_f1_v2.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
import xmlreader
from bib_tekst import *


def ubrkoment (text):
    """Return *text* with nowiki sections and HTML comments removed.

    Matching is case-insensitive and spans line breaks.  Removal is repeated
    until no match is left, so a section that only becomes complete after an
    earlier removal is cut out as well.  An opening nowiki tag without a
    closing tag is left untouched.
    """
    # The pattern has to include the closing nowiki tag: a lazy ``.*?`` with
    # nothing after it matches the empty string, so only the opening tag would
    # be removed and the protected text would stay in place.
    # The slash is written as ``[/]`` so that the literal closing tag never
    # appears in this source; this listing appears to be published inside
    # nowiki markup, which would swallow the literal tag.
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?<[/]nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text

def korr_im (jaz,t,pref):
    """Normalise the title *t*; return u'' for titles in other namespaces.

    The first entry of *pref* that *t* starts with is cut off and the rest is
    passed through perv_upper().  A result that contains a colon is kept only
    when it starts with 'Category:', 'Image:' or with the category / image
    namespace name that mysite.family reports for the language *jaz*.
    """
    # Only the first matching prefix is stripped.
    for lead in pref:
        if t.startswith(lead):
            t = t[len(lead):]
            break

    t = perv_upper(t)

    # No namespace separator at all: nothing to check.
    if u':' not in t:
        return t
    if t.startswith(u'Category:') or t.startswith(u'Image:'):
        return t
    # The localised namespace names are looked up only when they are needed,
    # category first and image second.
    if t.startswith(mysite.family.category_namespace(jaz) + u':'):
        return t
    if t.startswith(mysite.family.image_namespace(jaz) + u':'):
        return t
    return u''


def main(vhjaz,fvh,fvih,fipref):
    # For every redirect in the XML dump whose target is listed in the file
    # `fipref` (and whose title has at least two words), write one
    # "dop|title|target" line to `fvih`.
    #   vhjaz  - language code given to wikipedia.getSite()
    #   fvh    - dump location given to xmlreader.XmlDump()
    #   fvih   - output file, (re)created and written as UTF-8
    #   fipref - UTF-8 file with one target title per line
    # NOTE(review): tzfl, tzst, tzno, pref, n, insite and tblredir are
    # assigned but not read anywhere in this function.
    tzfl=0
    tzst={}
    tzno={}

    # Set of accepted redirect targets (dict used as a set).
    tblz={}

    pref=[]
    if 1:
        fpref=codecs.open(fipref,'rb',encoding='utf-8')
        for s in fpref.readlines():
            # Drop a leading BOM (U+FEFF = 65279).
            if ord(s[0]) == 65279:
                s=s[1:]
            # ubr_nk_prob is not defined in this script; presumably it comes
            # from the bib_tekst star import and trims surrounding
            # whitespace -- TODO confirm.
            s=ubr_nk_prob(s)
            tblz[s]=1
        fpref.close()


    n=u''
#    f0=codecs.open(fvh,'rb',encoding='utf-8')
    # NOTE(review): f1 is never closed explicitly; every record is flushed
    # right after it has been written.
    f1=codecs.open(fvih, 'w', 'utf-8')

    insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')

    tblredir = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(fvh)
    redirR = wikipedia.getSite().redirectRegex()
    readPagesCount = 0

    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 1000 pages
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount
        # Only pages whose text matches the redirect pattern are examined.
        m = redirR.search(entry.text)
        if m:
            target = m.group(1)
            # There might be redirects to another wiki. Ignore these.
            for code in wikipedia.getSite().family.langs.keys():
                if target.startswith('%s:' % code) or target.startswith(':%s:' % code):
                    wikipedia.output(u'NOTE: Ignoring %s which is a redirect to %s:' % (entry.title, code))
                    target = None
                    break
            # if the redirect does not link to another wiki
            if target:
                target = target.replace('_', ' ')
                # remove leading and trailing whitespace
                target = target.strip()
                # capitalize the first letter
                # NOTE(review): target[0] raises IndexError if target is empty
                # after strip().
                if not wikipedia.getSite().nocapitalize:
                    target = target[0].upper() + target[1:]
                # Cut off a section anchor and a pipe label, if present.
                if '#' in target:
                    target = target[:target.index('#')]
                if '|' in target:
                    wikipedia.output(u'HINT: %s is a redirect with a pipelink.' % entry.title)  
                    target = target[:target.index('|')]
#                tblredir[entry.title] = target
#                b=u'%s|%s\n' % (entry.title, target)
                tt=entry.title
                tc=target
                # Treat '-', '.' and '(' as word separators, then split on
                # single spaces (adjacent separators yield empty words).
                ttk=tt.replace(u'-',u' ').replace(u'.',u' ').replace(u'(',u' ')
                tck=tc.replace(u'-',u' ').replace(u'.',u' ').replace(u'(',u' ')
                tts=ttk.split(u' ')
                tcs=tck.split(u' ')
                ltt=len(tts)
                ltc=len(tcs)
                if tblz.has_key(tc) and ltt>=2:
                    # dop ends up 1 only when some title word has a character
                    # after its first one that changes under lower() (dop2)
                    # and no target word has such a character.
                    dop=1
                    dop2=0
                    for i in range(ltt):
                        if len(tts[i])>=2 and tts[i][1:]!=tts[i][1:].lower():
                            dop2=1
                    for i in range(ltc):
#                        if (  ( len(tcs[i])>2 or 
#                               (len(tcs[i])==2 and tcs[i][1]!=u'.') ) and 
#                                     tcs[i]==tcs[i].upper()  ):
                        if len(tcs[i])>=2 and tcs[i][1:]!=tcs[i][1:].lower():
                            dop=0
                    if dop2==0:
                        dop=0
                    b=u'%d|%s|%s\n' % (dop,tt,tc)
                    f1.write(b)
                    f1.flush()

# Script entry point: collect the arguments for main() and run it.
# The language code is fixed; the command-line variant is left disabled below.
vhjaz = u'eo'
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
# argv[1] is handed to main() as the dump location, argv[2] as the output file
# and argv[3] as the list file that main() always opens.
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
fpref = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')


#fotch = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')

#mysite = wikipedia.getSite()
#otch = codecs.open(fotch, 'w', 'utf-8')
# Module-level site object; korr_im() reads it as a global.
mysite = wikipedia.getSite()

# wikipedia.stopme() is called on the success path and on the failure path;
# an exception raised by main() is re-raised after that call.
try:
    main(vhjaz,fvh,fvih,fpref)
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()

</nowiki>

* podg_mma_f2.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
import xmlreader
from bib_tekst import *


def ubrkoment (text):
    """Return *text* with nowiki sections and HTML comments removed.

    Matching is case-insensitive and spans line breaks.  Removal is repeated
    until no match is left, so a section that only becomes complete after an
    earlier removal is cut out as well.  An opening nowiki tag without a
    closing tag is left untouched.
    """
    # The pattern has to include the closing nowiki tag: a lazy ``.*?`` with
    # nothing after it matches the empty string, so only the opening tag would
    # be removed and the protected text would stay in place.
    # The slash is written as ``[/]`` so that the literal closing tag never
    # appears in this source; this listing appears to be published inside
    # nowiki markup, which would swallow the literal tag.
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?<[/]nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text

def korr_im (jaz,t,pref):
    """Normalise the title *t*; return u'' for titles in other namespaces.

    The first entry of *pref* that *t* starts with is cut off and the rest is
    passed through perv_upper().  A result that contains a colon is kept only
    when it starts with 'Category:', 'Image:' or with the category / image
    namespace name that mysite.family reports for the language *jaz*.
    """
    # Only the first matching prefix is stripped.
    for lead in pref:
        if t.startswith(lead):
            t = t[len(lead):]
            break

    t = perv_upper(t)

    # No namespace separator at all: nothing to check.
    if u':' not in t:
        return t
    if t.startswith(u'Category:') or t.startswith(u'Image:'):
        return t
    # The localised namespace names are looked up only when they are needed,
    # category first and image second.
    if t.startswith(mysite.family.category_namespace(jaz) + u':'):
        return t
    if t.startswith(mysite.family.image_namespace(jaz) + u':'):
        return t
    return u''


def main(vhjaz,fvh,fvih,fipref):
    # Search the text of every non-redirect page in the XML dump for the
    # strings listed in `fipref` and write a replacement record per hit to
    # `fvih`.
    #   vhjaz  - language code given to wikipedia.getSite()
    #   fvh    - dump location given to xmlreader.XmlDump()
    #   fvih   - output file, (re)created and written as UTF-8
    #   fipref - UTF-8 file with "flag|old|new" lines
    # NOTE(review): tzfl, tzst, tzno, pref, n, insite and tblredir are
    # assigned but not read anywhere in this function.
    tzfl=0
    tzst={}
    tzno={}

    # Maps the text to look for (2nd field) to its replacement (3rd field).
    tblz={}

    pref=[]
    if 1:
        fpref=codecs.open(fipref,'rb',encoding='utf-8')
        for s in fpref.readlines():
            # Drop a leading BOM (U+FEFF = 65279), carriage returns and the
            # trailing newline.
            if ord(s[0]) == 65279:
                s=s[1:]
            s=s.replace(u'\r',u'')
            # NOTE(review): raises IndexError if s is empty at this point.
            if s[len(s)-1]==u'\n':
                s=s[:len(s)-1]
            # Keep only lines with exactly three fields whose first field is
            # u'1'.
            st=s.split(u'|')
            if len(st)==3 and st[0]==u'1':
                tblz[st[1]]=st[2]
        fpref.close()


    n=u''
#    f0=codecs.open(fvh,'rb',encoding='utf-8')
    # NOTE(review): f1 is never closed explicitly; every record is flushed
    # right after it has been written.
    f1=codecs.open(fvih, 'w', 'utf-8')

    insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')

    tblredir = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(fvh)
    redirR = wikipedia.getSite().redirectRegex()
    readPagesCount = 0

    # Counters: pages with at least one hit / total number of hits.
    sch_str=0
    sch_zam=0

    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 1000 pages
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount
        # Redirect pages are skipped.
        m = redirR.search(entry.text)
        if m:
            pass
        else:
            # Pages whose title starts with u'Vikipedio:' are skipped too.
            if entry.title.startswith(u'Vikipedio:'):
                continue
            # fperv is 1 until the first hit on this page has been written.
            fperv=1
            for tt, tc in tblz.iteritems():
                # A hit: the page text contains tt and the page title starts
                # with neither tt nor tc.
                if ( (not entry.title.startswith(tt)) and 
                     (not entry.title.startswith(tc)) and (tt in entry.text) ):

                    # The page header is written once, before the first hit.
                    if fperv:
                        b=u'|malmajuskligo\n%s\n'%entry.title
                        f1.write(b)
                        f1.flush()
                        sch_str+=1
                    b=((u'?=======\n1l\n-=======\n%s\n+=======\n%s\n')%(tt,tc))
                    f1.write(b)
                    f1.flush()
                    fperv=0
                    sch_zam+=1

            # Close the page's group of records with a separator line.
            if fperv==0:
                f1.write(u'========================\n')
                f1.flush()

    wikipedia.output(u'sch_str=%d sch_zam=%d'%(sch_str,sch_zam))


# Script entry point: collect the arguments for main() and run it.
# The language code is fixed; the command-line variant is left disabled below.
vhjaz = u'eo'
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
# argv[1] is handed to main() as the dump location, argv[2] as the output file.
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')

# argv[3] names the "flag|old|new" list file that main() always opens.
fpref = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')

#fotch = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')

#mysite = wikipedia.getSite()
#otch = codecs.open(fotch, 'w', 'utf-8')
# Module-level site object; korr_im() reads it as a global.
mysite = wikipedia.getSite()

# wikipedia.stopme() is called on the success path and on the failure path;
# an exception raised by main() is re-raised after that call.
try:
    main(vhjaz,fvh,fvih,fpref)
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()

</nowiki>

* svoj-ka.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia, pagegenerators
import re, sys
import codecs
import perevod,imagetransfer1

def iskat(t,s):
    """Return the index of the first occurrence of string *s* in string *t*.

    Returns -1 when *s* does not occur in *t*.  An empty *s* is found at
    index 0.
    """
    # str.find has exactly the contract of the former hand-written scan.
    return t.find(s)

def iskats(t,i,s):
    """Return the index of the first occurrence of *s* in *t* at or after *i*.

    Returns -1 when there is no such occurrence.  A negative *i* is treated
    as 0: the former hand-written scan sliced with the negative index, which
    counts from the end of *t* and could report a bogus negative position.
    """
    return t.find(s, max(i, 0))

def iskato(t,i,s):
    """Search backwards: return the largest index <= *i* at which *s* starts in *t*.

    Positions are tried from *i* down to 0; -1 is returned when none of them
    matches (in particular when *i* is negative).
    """
    width = len(s)
    for pos in range(i, -1, -1):
        if t[pos:pos + width] == s:
            return pos
    return -1

def vivod(b):
    # Echo the text `b` through wikipedia.output() and append it to the report
    # file `otch` (a global that main() opens), flushing immediately.
    wikipedia.output(b)
    otch.write(b)
    otch.flush()

def ubr_nk_prob(t):
    """Return *t* without leading and trailing spaces, tabs, CRs and newlines.

    Only these four characters are trimmed; whitespace inside the text is
    kept.  An empty or all-whitespace *t* yields an empty string.
    """
    # The statements that used to follow the final return were unreachable
    # and referred to names that are not defined in this script, so they have
    # been dropped.
    return t.strip(u' \n\t\r')


def main():
    """Write the title of every page in namespace 14 of the default site.

    The titles are echoed and written, one per line, to the UTF-8 report file
    named by the module-level ``filename``.  The file is opened as the global
    ``otch`` (used by vivod()) and is closed again when the listing ends,
    whether it finished or failed.
    """
    global otch

    mysite = wikipedia.getSite()
#    sico=wikipedia.getSite(code = u'commons',fam = u'commons')

    otch = codecs.open(u'%s' % filename, 'w', 'utf-8')
    try:
#        for page in mysite.allpages(start = u'', namespace = 6):
#            vivod(page.title()+u'\n')
        for page in mysite.allpages(start = u'', namespace = 14):
            vivod(page.title()+u'\n')
    finally:
        # Previously the report file was left open until interpreter exit.
        otch.close()

# Script entry point.  argv[1] names the report file that main() creates.
filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')

# Remaining arguments: "-log" redirects sys.stdout through logger.Logger,
# writing to 'com-izobr.log'; anything else is ignored.
for arg in sys.argv[2:]:
    arg = wikipedia.argHandler(arg, 'ivsen')
    if arg:
        if arg=="-log":
            import logger
            sys.stdout = logger.Logger(sys.stdout, filename = 'com-izobr.log')

# wikipedia.stopme() is called on the success path and on the failure path;
# an exception raised by main() is re-raised after that call.
try:
    main()
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()

* isk_ssilk.py

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
from bib_tekst import *

def vivod(b):
    # Echo the text `b` through wikipedia.output() and append it to the
    # module-level report file `otch`, flushing immediately.
    wikipedia.output(b)
    otch.write(b)
    otch.flush()

def isk_kateg(n):
    # Fetch the page titled `n` from `mysite`, log the outcome and write the
    # titles of all pages it links to into the result file `frez`.
    pl = wikipedia.Page(mysite,n)
    wikipedia.output(pl.title())

    try:
        ot = pl.get()
    except wikipedia.NoPage:
        # Missing page: report it with a leading "-" and stop.
        b = u'- %s\n\n' % pl.title()
        vivod(b)
        return
    except wikipedia.IsRedirectPage:
        # Redirect: report the page together with its redirect target and stop.
        b = u'+ %s\n-> %s\n\n' % (pl.title(),pl.getRedirectTarget())
        vivod(b)
        return

    # Existing page: note it in the report file only (no console echo) ...
    otch.write(u'+ %s\n' % pl.title())
    otch.flush()

    # ... and dump its title and full text into the second report file.
    otch2.write(u'=========================\n%s\n-=======\n%s\n' % (pl.title(),ot))
    otch2.flush()

    # NOTE(review): katprefi is referenced only by the disabled filter below.
    katprefi=mysite.family.category_namespace(mysite.lang)

    # Every linked page title is written, one per line, without filtering.
    lip=pl.linkedPages()
    for pl2 in lip:
        t=pl2.title()
        # NOTE(review): the disabled filter below calls "startwith"; the string
        # method is spelled "startswith" -- fix before re-enabling it.
#        if t.startwith(katprefi+u':') or t.startwith(u':'+katprefi+u':'):
        # NOTE(review): frez is not flushed here, unlike otch and otch2.
        frez.write(t+u'\n')

def main():
    """Run isk_kateg() on every line of the input file.

    The module-level ``filename`` is read as UTF-8.  Each line is passed to
    isk_kateg() after a leading byte order mark, all carriage returns and one
    trailing newline have been removed.  The input file is closed when the
    loop ends, whether it finished or failed.
    """
    f=codecs.open(filename,'rb',encoding='utf-8')
    try:
        for s in f.readlines():
            # Drop a leading BOM (U+FEFF).
            if s.startswith(u'\ufeff'):
                s=s[1:]
            s=s.replace(u'\r',u'')
            # endswith() is safe on an empty string, which indexing the last
            # character was not.
            if s.endswith(u'\n'):
                s=s[:-1]
            isk_kateg(s)
    finally:
        f.close()


# Script entry point.  All file names are derived from the base name in argv[1]:
#   <base>.txt     input list read by main()
#   <base>_ot.txt  report written by vivod() and isk_kateg()
#   <base>_op.txt  page texts written by isk_kateg()
#   <base>_sk.txt  linked page titles written by isk_kateg()
fi = wikipedia.argHandler(sys.argv[1], 'cht_cat')
filename = fi+'.txt'
fotch = fi+'_ot.txt'
fotch2 = fi+'_op.txt'
firez = fi+'_sk.txt'

#filename = wikipedia.argHandler(sys.argv[1], 'cht_cat')
#fotch = wikipedia.argHandler(sys.argv[2], 'cht_cat')
#fotch2 = wikipedia.argHandler(sys.argv[3], 'cht_cat')

# Flag cleared by a second argument "w".
# NOTE(review): fliwi is not read anywhere in the visible part of this script.
fliwi=1
if len(sys.argv)>=3 and sys.argv[2]==u'w':
    fliwi=0

# Output files and the site object are module-level globals used by the
# functions above.
# NOTE(review): none of the three files is closed explicitly.
otch = codecs.open(fotch, 'w', 'utf-8')
otch2 = codecs.open(fotch2, 'w', 'utf-8')
frez = codecs.open(firez, 'w', 'utf-8')
mysite = wikipedia.getSite()

# wikipedia.stopme() is called on the success path and on the failure path;
# an exception raised by main() is re-raised after that call.
try:
    main()
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()