Python Tutorial

Friday, November 2, 2012

Beautiful Soup find_all() search API


find_all() is the most popular method in the Beautiful Soup search API. It can reduce your code size massively. You can pass a regular expression or a custom function to it. I used this HTML file for practice.
All source code is available on GitHub.

 
from pprint import pprint
import re
from bs4 import BeautifulSoup

# Sample page used by this tutorial:
# http://dl.dropbox.com/u/49962071/blog/python/resource/bs_sample.html
# The with-block closes the file handle once the document has been parsed,
# instead of leaving it open for the rest of the script.
with open('bs_sample.html') as html_content:
    # NOTE(review): no parser is named here, so bs4 picks one of the parsers
    # installed on this machine -- consider passing one explicitly.
    soup = BeautifulSoup(html_content)  # making soup

# Searching by tag name. print(...) with a single argument behaves the same
# on Python 2 and Python 3, unlike the bare print statement.

# Every tag whose name starts with "p"
for tag in soup.find_all(re.compile("^p")):
    print(tag.name)

# Every tag whose name contains "t"
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

# True matches every tag in the document
for tag in soup.find_all(True):
    print(tag.name)

pprint(soup.find_all('a'))  # every <a> tag
print(20 * "++")  # visual separator in the output
pprint(soup.find_all(["a", "b"]))  # a list matches any of the names: <a> or <b>


def has_class_but_no_id(tag):
    """Return True when *tag* has a ``class`` attribute but no ``id`` attribute.

    Meant to be passed to ``soup.find_all()`` as a filter function.

    ``has_attr`` is used instead of ``has_key``: ``has_key`` is deprecated in
    Beautiful Soup 4 and ``has_attr`` is its documented replacement.
    """
    return tag.has_attr('class') and not tag.has_attr('id')

# A function can be used as the filter: find_all keeps the tags for which
# it returns True.
pprint(soup.find_all(has_class_but_no_id))

# text= searches the document's strings rather than its tags:
# every string containing "sisters"
pprint(soup.find_all(text=re.compile("sisters")))
print(20 * "++")  # visual separator; print(...) works on Python 2 and 3
pprint(soup.find_all(href=re.compile("my_url")))  # tags whose href contains "my_url"
pprint(soup.find_all(id=True))  # every tag that has an id attribute (not only links)
pprint(soup.find_all(class_=True))  # every tag that has a class attribute

def has_six_characters(css_class):
    """Return True when *css_class* is not None and has length 7.

    NOTE(review): despite the name, the length checked is 7, not 6 --
    confirm which one is intended before relying on the name.
    """
    if css_class is None:
        # find_all(class_=...) may hand over None; never a match.
        return False
    return len(css_class) == 7

# A function passed as class_ is used to filter on the class value:
# here, has_six_characters decides which tags are kept.
pprint(soup.find_all(class_=has_six_characters))

# The second positional argument filters on the CSS class.
pprint(soup.find_all("a", "sister"))  # <a> tags with class "sister"
pprint(soup.find_all("a", re.compile("sister")))  # <a> tags whose class contains "sister"
print(20 * "++")  # visual separator; print(...) works on Python 2 and 3

# Several keyword filters must all match:
# href contains "elsie" AND id == "link1"
pprint(soup.find_all(href=re.compile("elsie"), id='link1'))
# The same query written with an attrs dictionary
pprint(soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'}))

pprint(soup.find_all("a", limit=2))  # limit=2 caps the number of results at two

# recursive=True searches all descendants of <html>, not only direct children
pprint(soup.html.find_all("title", recursive=True))


2 comments:

Post a Comment