from pprint import pprint
import re
from bs4 import BeautifulSoup
# Sample document used by this walkthrough:
# http://dl.dropbox.com/u/49962071/blog/python/resource/bs_sample.html
# Read the sample inside a context manager so the file handle is closed
# as soon as parsing is finished.
with open('bs_sample.html') as html_content:
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parsers happen to be installed.
    soup = BeautifulSoup(html_content, "html.parser")  # making soup

for tag in soup.find_all(re.compile("^p")):  # tags whose name starts with "p"
    print(tag.name)
for tag in soup.find_all(re.compile("t")):  # tags whose name contains "t"
    print(tag.name)
for tag in soup.find_all(True):  # every tag in the document
    print(tag.name)
pprint(soup.find_all('a'))  # all <a> tags
print(20 * "++")
pprint(soup.find_all(["a", "b"]))  # all <a> and <b> tags
def has_class_but_no_id(tag):
    """Return True when *tag* has a ``class`` attribute but no ``id``.

    Used as a filter function passed to ``soup.find_all``.
    """
    # Tag.has_attr() is the supported replacement for the deprecated
    # Tag.has_key().
    return tag.has_attr('class') and not tag.has_attr('id')
# Pass a function to find_all; it is used as the filter.
pprint(soup.find_all(has_class_but_no_id))
# Find all text content that matches 'sisters'.
# NOTE(review): Beautiful Soup 4.4.0+ documents this argument as `string`;
# `text` is the older spelling -- confirm against the installed version.
pprint(soup.find_all(text=re.compile("sisters")))
print(20 * "++")
pprint(soup.find_all(href=re.compile("my_url")))  # tags whose href contains "my_url"
pprint(soup.find_all(id=True))  # tags that have an id attribute
pprint(soup.find_all(class_=True))  # tags that have a class attribute
def has_six_characters(css_class):
    """Return True when *css_class* is not None and is exactly 7 long.

    NOTE(review): despite the name, the check is for length 7, not 6.
    The length is kept at 7 here; confirm which one is intended.
    """
    return css_class is not None and len(css_class) == 7
# Tags whose class name is exactly 7 characters long (see has_six_characters).
pprint(soup.find_all(class_=has_six_characters))
pprint(soup.find_all("a", "sister"))  # <a> tags with the class 'sister'
# <a> tags whose class name contains 'sister'
pprint(soup.find_all("a", re.compile("sister")))
print(20 * "++")
# href contains "elsie" and id is "link1", given as keyword filters
pprint(soup.find_all(href=re.compile("elsie"), id='link1'))
# the same two filters, passed through the attrs dictionary
pprint(soup.find_all(attrs={'href': re.compile("elsie"), 'id': 'link1'}))
pprint(soup.find_all("a", limit=2))  # use limit on find_all
pprint(soup.html.find_all("title", recursive=True))  # use recursive on find_all
Friday, November 2, 2012
Beautiful Soup find_all() search API
find_all() is the most popular method in the Beautiful Soup search API. It can reduce your code size massively. We can pass
regular expressions and custom functions to it. I used this HTML file for practice.
All source code available on github
Posted by
Abu Zahed Jony
at
11:30 PM
Email This | BlogThis! | Share to X | Share to Facebook | Share to Pinterest
Labels:
Beautiful Soup
Subscribe to:
Post Comments (Atom)

1 comment:
Brilliant !
Post a Comment