Parsing HTML with lxml

I had to retrieve a list of repositories from my mirror. It was just the job for the python-lxml library!

from __future__ import print_function # only with python2
from lxml import html as lhtml
from urllib import urlopen
baseurl = 'http://my.mirror/path/'

# Fetch the mirror's index page and parse it. The handle is closed
# explicitly in a try/finally so it is released even if parsing fails
# (Python 2's urlopen() result cannot be used in a `with` statement).
response = urlopen(baseurl)
try:
    html = lhtml.parse(response)
finally:
    response.close()

# Collect every <a> element that is a direct child of a <td>.
folders = html.findall('//td/a')

# The first link is set aside as a header entry and is not printed.
# Guard against a page with no matching links instead of raising IndexError.
# (Python 3 supports:  header, *folders = html.findall('//td/a'))
header = folders.pop(0) if folders else None

# baseurl already ends with '/'; strip it once so that joining with
# sep="/" does not produce a double slash in the printed URLs.
root = baseurl.rstrip('/')
for f in folders:
    print(root, f.attrib['href'], sep="/")

Lascia un commento