I had to retrieve a list of repositories from my mirror. It was just the job for the python-lxml library!
from __future__ import print_function  # only needed on Python 2

# List the repository URLs linked from a mirror's HTML index page.
#
# The page at `baseurl` is downloaded and parsed with lxml, every <a> element
# that is a direct child of a <td> is collected, the first one is set aside as
# `header`, and the target of each remaining link is printed as an absolute
# URL, one per line.

from contextlib import closing

try:  # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen
    from urlparse import urljoin

from lxml import html as lhtml

baseurl = 'http://my.mirror/path/'

# closing() releases the connection as soon as parsing is done; on Python 2
# the object returned by urlopen() is not a context manager on its own.
with closing(urlopen(baseurl)) as response:
    html = lhtml.parse(response)

# get something like ...
# Relative form of the original '//td/a' query: search the whole tree for
# <a> elements that are direct children of a <td>.
folders = html.findall('.//td/a')

# Set the first link aside so it is not printed with the others.
# NOTE(review): presumably this is the header / parent-directory entry of the
# listing -- confirm against the mirror's index page.
# No trailing comma here: `header` is the element itself, not a 1-tuple, and
# an index page without any link no longer raises IndexError.
header = folders.pop(0) if folders else None
# python 3 supports (for a non-empty result):
#     header, *folders = html.findall('.//td/a')

for folder in folders:
    href = folder.get('href')
    if href is None:
        # Anchor without a target: there is no URL to print.
        continue
    # urljoin() resolves the link against baseurl, so the trailing '/' of
    # baseurl does not produce a doubled slash and absolute or root-relative
    # hrefs come out correctly.
    print(urljoin(baseurl, href))