Из примечаний: можно поменять значение переменной APODUrl на какое-нибудь зеркало (список можно найти здесь).
import os
import os.path
import sys
import HTMLParser
import re
import urllib
from httplib import *
from HTMLParser import HTMLParser
# Host name of the APOD server.  Every page request and every image URL in
# this script is built from it; it may be replaced with the host name of an
# APOD mirror.
APODUrl = "antwrp.gsfc.nasa.gov"
# Not referenced anywhere else in the visible code.
# NOTE(review): presumably the number of archive entries as of 2010-06-17 -
# confirm or remove.
COUNT100617 = 5477
# Resume marker: name of the archive page (e.g. "ap100617.html") at which
# APODArchiveParser starts collecting links.  "" means "start at the first
# matching link".  The script overwrites it with the first line of the
# progress log when that file exists.
HREF = ""
class APODArchiveParser(HTMLParser):
    """Collect links to daily APOD pages from the archive index page.

    After feed(), ``href`` holds the matching link targets in document
    order, beginning with the resume marker (inclusive).

    start_href -- page name to resume from.  None (the default) means
        "use the module-level HREF at parse time"; "" means "start at the
        first matching link".
    """

    # Daily pages are named "ap" + six digits + ".html".  The dot is escaped
    # so that look-alikes such as "ap100617xhtml" are rejected.
    _PAGE_RE = re.compile(r"ap\d{6}\.html")

    def __init__(self, start_href=None):
        HTMLParser.__init__(self)
        self.href = []
        # Becomes True once the resume marker has been seen; from then on
        # every matching link is collected.
        self.canAppend = False
        self.start_href = start_href

    def handle_starttag(self, tag, attrs):
        """Record the href of an <a> tag if it names a daily page."""
        if tag != 'a':
            return
        # .get() also covers <a> tags without href and valueless href
        # attributes (value None), which must simply be skipped.
        target = dict(attrs).get('href')
        if not target or not self._PAGE_RE.search(target):
            return
        # The global is looked up lazily so that a value assigned to HREF
        # after construction is still honoured.
        start = HREF if self.start_href is None else self.start_href
        if start == "" or target == start:
            self.canAppend = True
        if self.canAppend:
            self.href.append(target)
class APODParser(HTMLParser):
    """Extract the picture link from a daily APOD page.

    After feed(), ``imageURL`` holds the href of the last <a> tag whose
    target starts with "image/", or "" if no such link was seen.  The
    attribute is not cleared by feed() or reset(); callers that reuse one
    parser for several pages must set it back to "" themselves.
    """

    imageURL = ""

    def __init__(self):
        HTMLParser.__init__(self)
        self.imageURL = ""

    def handle_starttag(self, tag, attrs):
        """Remember the href of an <a> tag that points into "image/"."""
        if tag != 'a':
            return
        # .get() also covers <a> tags without href and valueless href
        # attributes (value None).
        target = dict(attrs).get('href')
        # The slash is required: a bare "image" prefix would also accept
        # unrelated targets such as "images.html".
        # NOTE(review): every match overwrites the previous one, so the last
        # "image/" link on the page wins - confirm that is the intended one.
        if target and target.startswith("image/"):
            self.imageURL = target
def _read_resume_marker(path):
    """Return the first line of the progress log at *path*, without its line ending."""
    with open(path, 'r') as log:
        # rstrip() also copes with an empty log file, where indexing the
        # last character would raise IndexError.
        return log.readline().rstrip("\r\n")


def _save_resume_marker(path, page):
    """Overwrite the progress log at *path* with the page name *page*."""
    with open(path, 'w') as log:
        log.write(page)


def _http_get(path):
    """GET *path* from the APODUrl host and return the response body."""
    conn = HTTPConnection(APODUrl)
    try:
        conn.request("GET", path)
        return conn.getresponse().read()
    finally:
        # Close the connection even when the request or the read fails.
        conn.close()


# --- Resume point ----------------------------------------------------------
# The log holds the name of the page that was being processed when the
# previous run stopped; HREF is left untouched when there is no log yet.
logfile = "APODlog.txt"
if os.path.isfile(logfile):
    HREF = _read_resume_marker(logfile)

# --- List of pages to visit ------------------------------------------------
apodArchive = APODArchiveParser()
apodArchive.feed(_http_get("/apod/archivepix.html"))
apodArchive.close()
apodParser = APODParser()

# --- Output directory (optional first command-line argument) ---------------
DIR = ""
if len(sys.argv) > 1:
    DIR = sys.argv[1]
    # An empty argument means "current directory"; it must not be indexed.
    if DIR and not DIR.endswith("/"):
        DIR = DIR + "/"

# --- Download loop ---------------------------------------------------------
i = 0
count = len(apodArchive.href)
sys.stdout.write("\r%d/%d" % (i, count))
sys.stdout.flush()
for url in apodArchive.href:
    i += 1
    # Store the page name first so that an interrupted run resumes with it.
    _save_resume_marker(logfile, url)
    data = _http_get("/apod/%s" % (url))

    # The parser object is shared between pages: drop whatever the previous
    # page left unparsed and clear the previous result.
    apodParser.reset()
    apodParser.imageURL = ""
    try:
        apodParser.feed(data)
    except Exception:
        # Exception, not BaseException: Ctrl-C and SystemExit must end the
        # run instead of being reported as a parser problem.  A link found
        # before the failure is still used below.
        sys.stdout.write("\n Parser exception on page http://%s/apod/%s !\n" % (APODUrl, url))
        sys.stdout.flush()

    # A usable link has at least "<directory>/<file>"; "" (nothing found)
    # splits into a single element and is reported as a parser error.
    parts = apodParser.imageURL.split("/")
    if len(parts) < 2:
        sys.stdout.write("\n Parser error on page http://%s/apod/%s !\n" % (APODUrl, url))
        sys.stdout.flush()
        continue

    # Pictures are stored as DIR/<last directory of the link>/<file name>.
    subdir = DIR + parts[-2]
    target = subdir + "/" + parts[-1]
    if not os.path.isdir(subdir):
        os.makedirs(subdir)
    if not os.path.isfile(target):
        # Download under a temporary name and rename when complete, so that
        # a truncated file from an interrupted run is never mistaken for a
        # finished picture by the isfile() check above.
        # NOTE(review): the link is relative to the page under /apod/, but
        # the URL is built relative to the server root - confirm that the
        # host serves /image/... as well.
        partial = target + ".part"
        urllib.urlretrieve("http://%s/%s" % (APODUrl, apodParser.imageURL), partial)
        os.rename(partial, target)

    if os.path.isfile(target):
        sys.stdout.write("\r%d/%d" % (i, count))
        sys.stdout.flush()
    else:
        sys.stdout.write('\n Error! Can not create file "%s".\n' % (target))
        sys.stdout.flush()
        break
Комментариев нет:
Отправить комментарий