Posted by Kosal
A sitemap is very important for ranking your website in search engines. In this article, I would like to show you the basics of how to create a sitemap generator using Python.
Requests is one of the most downloaded Python packages today. Requests allows you to send HTTP/1.1 requests extremely easily. There's no need to manually add query strings to your URLs, or to form-encode your PUT & POST data - but nowadays, just use the json method!
To install requests from PyPI:
pip install requests
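For example, fetching a page takes just a couple of lines (httpbin.org is only a public test endpoint used here for illustration):
import requests

response = requests.get('https://httpbin.org/get')
print(response.status_code)  # e.g. 200
print(response.json())       # the JSON body parsed into a Python dict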
Beautiful Soup is a library that makes it easy to scrape information from web pages. It sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.
To install Beautiful Soup from PyPI:
pip install beautifulsoup4
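As a quick illustration (the HTML snippet here is made up), Beautiful Soup can pull every link out of a page in a few lines:
from bs4 import BeautifulSoup

html = '<html><body><a href="/about">About</a> <a href="#top">Top</a></body></html>'
soup = BeautifulSoup(html, 'html.parser')  # html.parser is Python's built-in parser
for a in soup.find_all('a', href=True):    # only anchors that actually have an href
    print(a['href'])                       # -> /about, then #top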
html5lib is a pure-python library for parsing HTML. It is designed to conform to the WHATWG HTML specification, as is implemented by all major web browsers.
To install html5lib from PyPI:
pip install html5lib
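You select html5lib by passing its name as the second argument to BeautifulSoup. It parses messy real-world markup the same forgiving way a browser does; a small sketch:
from bs4 import BeautifulSoup

broken = '<p>Unclosed paragraph <a href="/page">link'  # deliberately malformed HTML
soup = BeautifulSoup(broken, 'html5lib')
print(soup.find('a')['href'])  # -> /page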
clean()
We use the clean() function to filter and clean the extracted links:
def clean(a_eles):
    links = []
    skip_links = []
    for a in a_eles:
        link = a['href']
        # Skip in-page anchors, mailto links, and the bare root path
        if link.startswith('#') or link.startswith('mailto:') or link == '/':
            skip_links.append(link)
            continue
        # Turn absolute paths like /about into full URLs
        if link.startswith('/'):
            link = '{}{}'.format(base_url, link)
        # Turn bare relative paths into full URLs
        if not link.startswith('http://') and not link.startswith('https://'):
            link = '{}/{}'.format(base_url, link)
        # Ignore links that point outside the site
        if not link.startswith(base_url):
            continue
        if link not in links:
            links.append(link)
    return [links, skip_links]
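Note that the manual prefixing above is a simplified form of URL resolution. If you prefer, the standard library's urllib.parse.urljoin handles relative references (including ../ paths) for you; a minimal sketch of that alternative, with resolve() being a hypothetical helper:
from urllib.parse import urljoin

base_url = 'https://www.khmernokor.com'

def resolve(href):
    # urljoin resolves '/path', 'path', and '../path' references alike
    return urljoin(base_url + '/', href)

print(resolve('/about'))  # https://www.khmernokor.com/about
print(resolve('page'))    # https://www.khmernokor.com/page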
get_next_scan_urls()
The get_next_scan_urls() function filters out URLs that have already been scanned, returning only the ones still to visit:
def get_next_scan_urls(urls):
    links = []
    for u in urls:
        if u not in scanned:
            links.append(u)
    return links
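The same filter can also be written as a one-line list comprehension, and if you crawl a large site it is worth keeping scanned in a set, since membership tests on a set are O(1) rather than O(n). A sketch of that variant (scan() would then call scanned.add(url) instead of append):
scanned = set()  # a set instead of a list: 'u not in scanned' becomes O(1)

def get_next_scan_urls(urls):
    return [u for u in urls if u not in scanned]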
scan()
The scan() function does the actual crawling: it sends a request to each URL, extracts the links, and recursively scans any links it has not visited before:
def scan(url):
    if url not in scanned:
        print('Scan url: {}'.format(url))
        scanned.append(url)
        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html5lib')
        a_eles = soup.find_all('a', href=True)
        links, skip_links = clean(a_eles)
        next_scan_urls = get_next_scan_urls(links)
        print('Count next scan: {}'.format(len(next_scan_urls)))
        for l in next_scan_urls:
            scan(l)
    return scanned
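One caveat: scan() calls itself once per newly discovered link, so a site with thousands of pages can exceed Python's default recursion limit (about 1000 frames). An explicit queue avoids this; here is a minimal iterative sketch under the same assumptions, with scan_iterative being a hypothetical replacement name:
from collections import deque

def scan_iterative(start_url):
    # Breadth-first crawl with an explicit queue instead of recursion
    queue = deque([start_url])
    while queue:
        url = queue.popleft()
        if url in scanned:
            continue
        print('Scan url: {}'.format(url))
        scanned.append(url)
        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html5lib')
        links, skip_links = clean(soup.find_all('a', href=True))
        queue.extend(get_next_scan_urls(links))
    return scanned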
main()
The main() function starts the scan from the given website and assembles the sitemap XML from the collected URLs:
def main():
    links = scan(website)
    urls = ''
    for l in links:
        urls += f"""  <url>
    <loc>{l}</loc>
    <lastmod>2022-07-27T02:24:08.242Z</lastmod>
    <priority>0.6</priority>
  </url>
"""
    # The XML declaration must be the very first bytes of the file,
    # so the template starts directly with it (no leading newline)
    xml = f"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{urls}</urlset>
"""
    with open('sitemap.xml', 'w') as f:
        f.write(xml)
Finally, it writes the generated XML content into a file named sitemap.xml.
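One detail worth noting: the lastmod value in the template above is hardcoded. In practice you would probably generate it at run time, for example with the standard datetime module:
from datetime import datetime, timezone

# Produces an ISO 8601 UTC timestamp like 2022-07-27T02:24:08.242Z
lastmod = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
print(lastmod)
With all the pieces explained, here is the full script: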
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup

website = 'https://www.khmernokor.com'

# Normalize the base URL by dropping any trailing slash
base_url = website
if website.endswith('/'):
    base_url = website[:-1]

scanned = []

def clean(a_eles):
    links = []
    skip_links = []
    for a in a_eles:
        link = a['href']
        # Skip in-page anchors, mailto links, and the bare root path
        if link.startswith('#') or link.startswith('mailto:') or link == '/':
            skip_links.append(link)
            continue
        # Turn absolute paths like /about into full URLs
        if link.startswith('/'):
            link = '{}{}'.format(base_url, link)
        # Turn bare relative paths into full URLs
        if not link.startswith('http://') and not link.startswith('https://'):
            link = '{}/{}'.format(base_url, link)
        # Ignore links that point outside the site
        if not link.startswith(base_url):
            continue
        if link not in links:
            links.append(link)
    return [links, skip_links]

def get_next_scan_urls(urls):
    links = []
    for u in urls:
        if u not in scanned:
            links.append(u)
    return links

def scan(url):
    if url not in scanned:
        print('Scan url: {}'.format(url))
        scanned.append(url)
        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html5lib')
        a_eles = soup.find_all('a', href=True)
        links, skip_links = clean(a_eles)
        next_scan_urls = get_next_scan_urls(links)
        print('Count next scan: {}'.format(len(next_scan_urls)))
        for l in next_scan_urls:
            scan(l)
    return scanned

def main():
    links = scan(website)
    urls = ''
    for l in links:
        urls += f"""  <url>
    <loc>{l}</loc>
    <lastmod>2022-07-27T02:24:08.242Z</lastmod>
    <priority>0.6</priority>
  </url>
"""
    # The XML declaration must be the very first bytes of the file,
    # so the template starts directly with it (no leading newline)
    xml = f"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{urls}</urlset>
"""
    with open('sitemap.xml', 'w') as f:
        f.write(xml)

if __name__ == '__main__':
    main()
Output:
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://www.khmernokor.com</loc>
    <lastmod>2022-07-27T02:24:08.242Z</lastmod>
    <priority>0.6</priority>
  </url>
  <url>
    <loc>https://www.khmernokor.com/question-answers/jiwn5gvqpu</loc>
    <lastmod>2022-07-27T02:24:08.242Z</lastmod>
    <priority>0.6</priority>
  </url>
  <url>
    <loc>https://www.khmernokor.com/bun98</loc>
    <lastmod>2022-07-27T02:24:08.242Z</lastmod>
    <priority>0.6</priority>
  </url>
  <url>
    <loc>https://www.khmernokor.com/yuravandy</loc>
    <lastmod>2022-07-27T02:24:08.242Z</lastmod>
    <priority>0.6</priority>
  </url>
  <!-- ... -->
</urlset>
I hope this article gives you some ideas for generating your own sitemap.