# Mirror the libsdl.org/cgi/docwiki.cgi documentation
# Copyright (C) 2008  Sylvain Beucler
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


# Because http://libsdl.org/robots.txt disallows crawling, the SDL wiki
# does not appear in search engines. This mirror works around that.


import os
import time
import urllib
from xml.dom import minidom

# Mirror every page listed in the wiki's title index into 'pages.new/',
# writing each fetched URL to the 'log' file as it goes.

# Grab index
urllib.urlretrieve('http://libsdl.org/cgi/docwiki.cgi/TitleIndex?action=titleindex&mimetype=text/xml',
                   '00_list.xml')

# Get a list of pages
xmldoc = minidom.parse('00_list.xml')
# Grab '<Title>' nodes
pages_nodes = xmldoc.getElementsByTagName('Title')
# Grab page title strings; an empty <Title/> element has no text child
# (firstChild is None), so skip it instead of crashing.
pages = [node.firstChild.data for node in pages_nodes
         if node.firstChild is not None]

# Download all pages.  The 'with' block guarantees the log is closed even
# if one of the downloads raises.
# (To test on a single page, iterate over [u'SDL_Init'] instead of pages.)
with open('log', 'w') as log:
    for page in pages:
        page = page.encode('utf-8')
        # NOTE(review): presumably mirrors the wiki's own escaping of
        # underscores in page URLs -- TODO confirm.
        page = page.replace('_', '_5f')
        page = urllib.pathname2url(page)
        # Flatten sub-page separators so every page lands directly in
        # the 'pages.new' directory.
        filename = "pages.new/%s" % page.replace('/', '_')
        head = os.path.dirname(filename)
        if head and not os.path.isdir(head):
            os.makedirs(head)
        # append '?action=raw' to get the source
        url = "http://libsdl.org/cgi/docwiki.cgi/%s" % page
        # Log and flush before fetching, so the log shows which URL was
        # in progress if the download fails.
        log.write(url + "\n")
        log.flush()
        urllib.urlretrieve(url, filename)
        time.sleep(2)  # avoid triggering MoinMoin's surge protection

