Crawlers in action – people.com.cn

Time: 2024-4-8

Preamble

To consolidate what I have learned, I am starting to post study-note style articles for my own future review. If they also help readers who are just getting started with the technology, all the better. I am still a beginner myself, so if there are any mistakes in the article, please feel free to point them out.
(The reference source code for this post is available in the resources on my homepage; if you run into questions while following along, feel free to ask in the comments section.)


The crawler targets the article list API at http://jhsjk.people.cn/testnew/result, which returns paginated JSON; each article page is then fetched from jhsjk.people.cn, cleaned, and stored in MongoDB together with any images.
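Before the full class, here is a minimal sketch of a single request to that list endpoint. The query parameters and the JSON fields it reads (list, article_id, title, input_date) mirror what the crawler below sends and parses; the live API's exact response shape is an assumption here, so treat this as an illustration rather than a specification.

import requests

# One request to the list endpoint (page 1), using the same kind of query parameters the crawler sends.
params = {
    'keywords': '', 'isFuzzy': '0', 'searchArea': '0', 'year': '0',
    'form': '', 'type': '0', 'page': 1, 'origin': 'all', 'source': '2',
}
headers = {'User-Agent': 'Mozilla/5.0'}  # a browser-like User-Agent; the full crawler also sends Referer and Cookie
resp = requests.get('http://jhsjk.people.cn/testnew/result', params=params, headers=headers)
data = resp.json()  # assumed: the endpoint returns a JSON object containing a 'list' array

# Print the fields the crawler relies on for each catalogue entry.
for item in data.get('list', [])[:3]:
    print(item.get('article_id'), item.get('input_date'), item.get('title'))

The ArticleCrawler class below wraps this request together with article parsing, image downloading, and MongoDB storage.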

import os
import re
from datetime import datetime
import requests
import json
from bs4 import BeautifulSoup
from pymongo import MongoClient
from tqdm import tqdm


class ArticleCrawler:
    def __init__(self, catalogues_url, card_root_url, output_dir, db_name='ren-ming-wang'):
        self.catalogues_url = catalogues_url
        self.card_root_url = card_root_url
        self.output_dir = output_dir
        self.client = MongoClient('mongodb://localhost:27017/')
        self.db = self.client[db_name]
        self.catalogues = self.db['catalogues']
        self.cards = self.db['cards']
        self.headers = {
            'Referer': 'https://jhsjk.people.cn/result?',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/119.0.0.0 Safari/537.36',
            'Cookie': 'Replace with your own',
        }

    # Send a GET request with query parameters and return the parsed response
    def fetch_page(self, url, page):
        params = {
            'keywords': '',
            'isFuzzy': '0',
            'searchArea': '0',
            'year': '0',
            'form': '',
            'type': '0',
            'page': page,
            'origin': 'all ',
            'source': '2',
        }
        response = requests.get(url, params=params, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

    # Parse one page of the catalogue (list) JSON and store each entry
    def parse_catalogues(self, json_catalogues):
        card_list = json_catalogues['list']
        for item in card_list:  # renamed from `list` to avoid shadowing the builtin
            a_tag = 'article/' + item['article_id']
            card_url = self.card_root_url + a_tag
            card_title = item['title']
            updateTime = item['input_date']
            self.parse_cards(card_url, updateTime)
            date = datetime.now()
            catalogues_id = item['article_id'] + '01'
            # Skip catalogue entries that are already stored
            existing_docs = self.catalogues.find_one({'id': catalogues_id})
            if existing_docs is not None:
                print(f'layout id: {catalogues_id} [already exists]')
                continue
            card_data = {
                'id': catalogues_id,
                'title': card_title,
                'page': 1,
                'serial': 1,
                # One page, one article
                'dailyId': '',
                'cardSize': 1,
                'subjectCode': '50',
                'updateTime': updateTime,
                'institutionCode': '10000',
                'date': date,
                'snapshot': {}
            }
            self.catalogues.insert_one(card_data)
            print(f'layout id: {catalogues_id} [inserted successfully]')

    # Parse an individual article page and store it
    def parse_cards(self, url, updateTime):
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, "html.parser")
        try:
            title = soup.find("div", "d2txt clearfix").find('h1').text
        except AttributeError:
            try:
                title = soup.find('h1').text
            except AttributeError:
                print(f'[Could not parse the title of this article] {url}')
                return
        html_content = soup.find('div', 'd2txt_con clearfix')
        text = html_content.get_text()
        imgs = [img.get('src') or img.get('data-src') for img in html_content.find_all('img')]
        cleaned_content = self.clean_content(text)
        # Extract the numeric article id from the URL
        match = re.search(r'\d+', url)
        card_id = match.group()
        date = datetime.now()
        if len(imgs) != 0:
            # Download images
            self.download_images(imgs, card_id)
        # Build the document to store
        document = {
            'id': card_id,
            'serial': 1,
            'page': 1,
            'url': url,
            'type': 'ren-ming-wang',
            'catalogueId': card_id + '01',
            'subjectCode': '50',
            'institutionCode': '10000',
            'updateTime': updateTime,
            'flag': 'true',
            'date': date,
            'title': title,
            'illustrations': imgs,
            'html_content': str(html_content),
            'content': cleaned_content
        }
        # Skip articles that are already stored
        existing_docs = self.cards.find_one({'id': card_id})
        if existing_docs is None:
            # Insert the document
            self.cards.insert_one(document)
            print(f'Article id: {card_id} [inserted successfully]')
        else:
            print(f'Article id: {card_id} [already exists]')

    # Clean up the extracted article text
    def clean_content(self, content):
        if content is not None:
            content = re.sub(r'\r', r'\n', content)
            content = re.sub(r'\n{2,}', '', content)
            # content = re.sub(r'\n', '', content)
            content = re.sub(r' {6,}', '', content)
            content = re.sub(r' {3,}\n', '', content)
            content = content.replace('<P>', '').replace('</P>', '').replace('&nbsp;', ' ')
        return content

    # Download the article's images into a per-article folder
    def download_images(self, img_urls, card_id):
        # Create a new subdirectory named after card_id
        images_dir = os.path.join(self.output_dir, card_id)
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)
            downloaded_images = []
            for img_url in img_urls:
                try:
                    response = requests.get(img_url, stream=True)
                    if response.status_code == 200:
                        # Derive the image file name from the URL
                        image_name = os.path.join(images_dir, img_url.split('/')[-1])
                        # Skip files that have already been saved
                        if os.path.exists(image_name):
                            continue
                        with open(image_name, 'wb') as f:
                            f.write(response.content)
                        downloaded_images.append(image_name)
                        print(f"Image downloaded: {img_url}")
                except Exception as e:
                    print(f"Failed to download image {img_url}. Error: {e}")
            return downloaded_images
        # Skip articles whose image folder already exists
        else:
            print(f'The image folder with article id {card_id} already exists')

    # Find out how many pages of results there are
    def find_page_all(self, soup):
        # Find the <em> tag that carries the paging onclick handler
        em_tag = soup.find('em', onclick=True)
        # Extract the page number from the onclick attribute
        if em_tag and 'onclick' in em_tag.attrs:
            onclick_value = em_tag['onclick']
            page_number = int(onclick_value.split('(')[1].split(')')[0])
            return page_number
        else:
            print("Couldn't find the total number of pages of data")

    # Close the connection to MongoDB
    def close_connection(self):
        self.client.close()

    # Run the crawler: loop over every catalogue page, parse its articles and store them
    def run(self):
        soup_catalogue = self.fetch_page(self.catalogues_url, 1)
        page_all = self.find_page_all(soup_catalogue)
        if page_all:
            # +1 so the final page is crawled as well
            for index in tqdm(range(1, page_all + 1), desc='Page'):
                # for index in tqdm(range(1, 50), desc='Page'):
                soup_catalogues = self.fetch_page(self.catalogues_url, index).text
                # Parse the JSON data
                soup_catalogues_json = json.loads(soup_catalogues)
                self.parse_catalogues(soup_catalogues_json)
                print(f'======================================Finished page {index}======================================')
        self.close_connection()


if __name__ == "__main__":
    crawler = ArticleCrawler(
        catalogues_url='http://jhsjk.people.cn/testnew/result',
        card_root_url='http://jhsjk.people.cn/',
        output_dir='D:\\ren-ming-wang\\img'
    )
    crawler.run()               # Run the crawler and fetch all content
    crawler.close_connection()  # Close the database connection
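
After a run, a quick way to verify what was stored is to query the two collections directly. The sketch below assumes the same local MongoDB instance and database name ('ren-ming-wang') used by the crawler; it is only a sanity check, not part of the crawler itself.

from pymongo import MongoClient

# Count the stored catalogue entries and article cards, then peek at one document.
client = MongoClient('mongodb://localhost:27017/')
db = client['ren-ming-wang']
print('catalogue entries:', db['catalogues'].count_documents({}))
print('article cards:', db['cards'].count_documents({}))
print(db['cards'].find_one({}, {'_id': 0, 'title': 1, 'url': 1, 'updateTime': 1}))
client.close()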
