# Mirror the libsdl.org/cgi/docwiki.cgi documentation
# Copyright (C) 2008  Sylvain Beucler
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


# As http://libsdl.org/robots.txt blocks indexing, the SDL wiki
# doesn't appear in search engines. This mirror fixes that.

import calendar
import os, time
import urllib
from xml.dom import minidom
import feedparser, stat, sys

# The script takes exactly one argument: the operating mode.
# Anything else prints the usage string and exits.
if len(sys.argv) != 2:
    sys.exit("Usage: %s build|refresh" % sys.argv[0])
mode = sys.argv[1]


# Folder (relative to the current directory) that receives mirrored pages.
data_path = '../data/pages.orig'

# Build the list of page titles to download, depending on the mode:
#   build   - every page listed in the wiki's TitleIndex
#   refresh - pages from the RecentChanges feed updated since the last
#             refresh (tracked through the mtime of the 'stamp' file),
#             plus the RecentChanges page itself
if mode == 'build':
    # Create folder to store all pages. An already existing folder is
    # fine; any other failure (permissions, a plain file in the way...)
    # is reported instead of being silently ignored.
    try:
        os.makedirs(data_path)
    except OSError:
        if not os.path.isdir(data_path):
            raise

    # Force browsers to read pages as HTML, encoded with UTF-8
    # (instead of plain-text + server sitewide encoding)
    with open(data_path + '/.htaccess', 'w') as htaccess:
        htaccess.write('ForceType text/html;charset=UTF-8\n')

    # Grab index
    urllib.urlretrieve('http://libsdl.org/cgi/docwiki.cgi/TitleIndex?action=titleindex&mimetype=text/xml',
                       '00_list.xml')

    # Get a list of pages
    xmldoc = minidom.parse('00_list.xml')
    # Grab '<Title>' nodes
    pages_nodes = xmldoc.getElementsByTagName('Title')
    # Grab page title strings; a '<Title>' element without any text
    # child has no title to download and is skipped.
    pages = [i.firstChild.data for i in pages_nodes
             if i.firstChild is not None]
elif mode == 'refresh':
    f = feedparser.parse('http://libsdl.org/cgi/docwiki.cgi/RecentChanges?action=rss_rc&unique=1&items=100')
    stamp_filename = 'stamp'
    if os.path.exists(stamp_filename):
        stamp_mtime = os.stat(stamp_filename)[stat.ST_MTIME]
    else:
        stamp_mtime = time.time() - 24*60*60 # yesterday

    pages = []
    for entry in f.entries:
        updated = entry.get('updated_parsed')
        # feedparser exposes parsed dates as UTC time tuples, so they
        # must be converted with calendar.timegm(); time.mktime() would
        # interpret them as local time and skew the comparison by the
        # machine's UTC offset. Entries without a usable date are
        # re-downloaded to stay on the safe side.
        if updated is None or calendar.timegm(updated) > stamp_mtime:
            pages.append(entry.title)
    pages.append('RecentChanges')

    # Touch stamp
    # NOTE(review): the stamp is updated before the pages are actually
    # downloaded; if the download step fails midway, the pages that were
    # not fetched will not be retried on the next refresh.
    with open(stamp_filename, 'w'):
        pass
else:
    sys.exit("Invalid mode '%s'" % mode)

#
# Download all pages
#
# Fetch every page in 'pages' into data_path, one file per page.
# Each URL is written to the 'log' file and flushed before the download
# starts, so the log shows which page was being fetched if the run stops.
# The 'with' block closes the log even when a download raises.
with open('log', 'w') as log:
    for page in pages:
        # URL-escape the UTF-8 encoded page title
        quoted = urllib.pathname2url(page.encode('utf-8'))
        # Sub-page separators '/' become '_' in the local file name
        # (noted by the original author as equivalent in MoinMoin)
        filename = data_path + '/' + quoted.replace('/', '_')
        # This fetches the page as served by the wiki; per the original
        # author's note, appending '?action=raw' would get the source.
        url = "http://libsdl.org/cgi/docwiki.cgi/%s" % quoted
        log.write(url + "\n")
        log.flush()
        urllib.urlretrieve(url, filename)
        time.sleep(2) # avoid triggering MoinMoin's surge protection

