from pprint import pprint import re from bs4 import BeautifulSoup html_content = open('bs_sample.html') #http://dl.dropbox.com/u/49962071/blog/python/resource/bs_sample.html soup = BeautifulSoup(html_content) # making soap for tag in soup.find_all(re.compile("^p")): # find all tag start with p print tag.name for tag in soup.find_all(re.compile("t")): # find all tag contains t print tag.name for tag in soup.find_all(True): # find all tag print tag.name pprint(soup.find_all('a')) # find all a tag print 20*"++" pprint(soup.find_all(["a", "b"])) # find multiple tag def has_class_but_no_id(tag): return tag.has_key('class') and not tag.has_key('id') pprint(soup.find_all(has_class_but_no_id)) # pass a function to find_all pprint(soup.find_all(text=re.compile("sisters"))) # find all tag content contains key 'sisters' print 20*"++" pprint(soup.find_all(href=re.compile("my_url"))) # all links contains key "my_url" pprint(soup.find_all(id=True)) # all links has id pprint(soup.find_all(class_=True)) # all links has class def has_six_characters(css_class): return css_class is not None and len(css_class) == 7 pprint(soup.find_all(class_=has_six_characters)) # find all class name contains 7 characters pprint(soup.find_all("a", "sister")) # find all a tag have class named 'sister' pprint(soup.find_all("a", re.compile("sister"))) # find all a tag have class named contains 'sister' print 20*"++" pprint(soup.find_all(href=re.compile("elsie"), id='link1')) # url name contains elsie and have id = link1 pprint(soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'})) # url name contains elsie and have id = link1 pprint(soup.find_all("a", limit=2)) # use limit on find_all pprint(soup.html.find_all("title", recursive=True)) # use recursive on findall
Friday, November 2, 2012
Beautiful Soup find_all() search API
find_all() is the most popular method in the Beautiful Soup search API. It's reduce your code size massively. We can use
regular expression, custom function into it. I used this html file for practice.
All source code available on github
Posted by
Abu Zahed Jony
at
11:30 PM
Email ThisBlogThis!Share to XShare to FacebookShare to Pinterest
Labels:
Beautiful Soup
Subscribe to:
Post Comments (Atom)
1 comments:
Brilliant !
Post a Comment