## Introduction: Data Acquisition in the Online Education Era

With online education booming, course platforms keep multiplying. For learners, education researchers, and content analysts, the course data on these platforms is valuable. This article shows how to crawl online course information efficiently and reliably with modern Python techniques, asynchronous programming in particular.

## Technology Choices: A Modern Python Crawler Stack

- Async programming: asyncio and aiohttp for high-concurrency fetching
- Parsing: parsel (a standalone library based on Scrapy's Selector) and BeautifulSoup4
- Browser automation: playwright for JavaScript-rendered pages
- Data storage: pandas and sqlalchemy for structured output
- Proxies and anti-bot measures: a proxy pool plus randomized User-Agent headers

## 1. Environment Setup and Dependencies

```bash
# Install dependencies (asyncio ships with Python and needs no install)
pip install aiohttp parsel beautifulsoup4 pandas sqlalchemy playwright backoff matplotlib
python -m playwright install  # install browser binaries
```

## 2. Basic Configuration Module

```python
# config.py
import random
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class CrawlerConfig:
    """Crawler configuration."""
    # Request settings
    MAX_CONCURRENT_REQUESTS: int = 10
    REQUEST_TIMEOUT: int = 30
    RETRY_TIMES: int = 3

    # Crawl delays (avoid getting banned)
    MIN_DELAY: float = 0.5
    MAX_DELAY: float = 2.0

    # Proxy settings
    USE_PROXY: bool = False
    PROXY_POOL: Optional[List[str]] = None

    # Output settings
    OUTPUT_FORMAT: str = "csv"  # csv, json, database
    OUTPUT_FILE: str = "courses_data.csv"

    # Database settings
    DATABASE_URL: str = "sqlite:///courses.db"


class UserAgentManager:
    """User-Agent rotation."""

    DESKTOP_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
    ]

    MOBILE_AGENTS = [
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 14; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.210 Mobile Safari/537.36",
    ]

    @classmethod
    def get_random_ua(cls, device_type: str = "desktop") -> str:
        """Return a random User-Agent string."""
        if device_type == "mobile":
            return random.choice(cls.MOBILE_AGENTS)
        return random.choice(cls.DESKTOP_AGENTS)
```
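Assuming the file is saved as config.py, a quick illustrative check (not part of the original article) confirms the module behaves as expected:

```python
# Quick check of the configuration module (illustrative only)
from config import CrawlerConfig, UserAgentManager

config = CrawlerConfig(MAX_CONCURRENT_REQUESTS=5, OUTPUT_FORMAT="json")
print(config.OUTPUT_FILE)                        # courses_data.csv (default)
print(UserAgentManager.get_random_ua())          # random desktop UA
print(UserAgentManager.get_random_ua("mobile"))  # random mobile UA
```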
## 3. The Async Crawler Core Engine

```python
# crawler_engine.py
import asyncio
import logging
import random
from datetime import datetime
from typing import Dict, Optional

import aiohttp
import backoff

from config import CrawlerConfig, UserAgentManager


class AsyncCrawlerEngine:
    """Asynchronous crawler engine."""

    def __init__(self, config: CrawlerConfig):
        self.config = config
        self.session: Optional[aiohttp.ClientSession] = None
        self.semaphore = asyncio.Semaphore(config.MAX_CONCURRENT_REQUESTS)
        self.logger = self._setup_logger()

    def _setup_logger(self):
        """Configure logging to console (INFO) and file (DEBUG)."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.DEBUG)

        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)

        file_handler = logging.FileHandler(
            f"crawler_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        )
        file_handler.setLevel(logging.DEBUG)

        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        console_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)
        return logger

    async def __aenter__(self):
        """Async context manager entry: open the HTTP session."""
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=self.config.REQUEST_TIMEOUT),
            headers={"User-Agent": UserAgentManager.get_random_ua()},
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: close the session."""
        if self.session:
            await self.session.close()

    @backoff.on_exception(
        backoff.expo,
        (aiohttp.ClientError, asyncio.TimeoutError),
        max_tries=3,
        raise_on_giveup=False,  # return None after the final retry
    )
    async def fetch_url(self, url: str, **kwargs) -> Optional[str]:
        """Fetch a URL asynchronously, with concurrency limiting and retries."""
        async with self.semaphore:
            headers = kwargs.pop("headers", {})
            headers["User-Agent"] = UserAgentManager.get_random_ua()

            proxy = None
            if self.config.USE_PROXY and self.config.PROXY_POOL:
                proxy = random.choice(self.config.PROXY_POOL)

            try:
                async with self.session.get(
                    url, headers=headers, proxy=proxy, **kwargs
                ) as response:
                    response.raise_for_status()
                    # Random delay to avoid hammering the server
                    await asyncio.sleep(
                        random.uniform(self.config.MIN_DELAY, self.config.MAX_DELAY)
                    )
                    return await response.text()
            except (aiohttp.ClientError, asyncio.TimeoutError):
                # Let the backoff decorator handle retries for these
                self.logger.warning(f"Request failed, may retry: {url}")
                raise
            except Exception as e:
                self.logger.error(f"Request failed {url}: {e}")
                return None

    async def fetch_multiple_urls(self, urls: list) -> Dict[str, Optional[str]]:
        """Fetch multiple URLs concurrently."""
        tasks = [self.fetch_url(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return {
            url: result if not isinstance(result, Exception) else None
            for url, result in zip(urls, results)
        }
```
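Before wiring the engine into the full crawler, a minimal driver sketch shows the intended usage; the URLs below are placeholders:

```python
# demo_engine.py: illustrative usage of AsyncCrawlerEngine
import asyncio

from config import CrawlerConfig
from crawler_engine import AsyncCrawlerEngine


async def demo():
    config = CrawlerConfig(MAX_CONCURRENT_REQUESTS=3)
    async with AsyncCrawlerEngine(config) as engine:
        pages = await engine.fetch_multiple_urls([
            "https://example.com/course/1",  # placeholder URLs
            "https://example.com/course/2",
        ])
        for url, html in pages.items():
            print(url, "->", len(html) if html else "failed")


if __name__ == "__main__":
    asyncio.run(demo())
```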
## 4. The Smart Parsing Module

```python
# parser.py
import json
import re
from typing import Any, Dict, Optional

from bs4 import BeautifulSoup
from parsel import Selector

from config import UserAgentManager


class CourseParser:
    """Course information parser."""

    @staticmethod
    def parse_with_parsel(html: str, base_url: str = "") -> Dict[str, Any]:
        """Parse an HTML page with parsel, trying several selector strategies."""
        selector = Selector(text=html)
        course_data = {
            "title": CourseParser._extract_title(selector),
            "instructor": CourseParser._extract_instructor(selector),
            "price": CourseParser._extract_price(selector),
            "rating": CourseParser._extract_rating(selector),
            "enrollment_count": CourseParser._extract_enrollment(selector),
            "duration": CourseParser._extract_duration(selector),
            "category": CourseParser._extract_category(selector),
            "description": CourseParser._extract_description(selector),
            "source_url": base_url,
            "platform": CourseParser._detect_platform(base_url),
        }
        return course_data

    @staticmethod
    def parse_json_ld(html: str) -> Optional[Dict[str, Any]]:
        """Parse JSON-LD structured data embedded in the page."""
        soup = BeautifulSoup(html, "html.parser")
        json_ld_scripts = soup.find_all("script", type="application/ld+json")
        for script in json_ld_scripts:
            try:
                data = json.loads(script.string)
                if data.get("@type") in ["Course", "Product", "CreativeWork"]:
                    return data
            except (json.JSONDecodeError, TypeError, AttributeError):
                continue
        return None

    @staticmethod
    def _extract_title(selector: Selector) -> str:
        """Extract the course title."""
        selectors = [
            '//h1[@class="course-title"]/text()',
            '//h1[contains(@class, "title")]/text()',
            '//meta[@property="og:title"]/@content',
            "//title/text()",
        ]
        for xpath in selectors:
            result = selector.xpath(xpath).get()
            if result and result.strip():
                return result.strip()
        return ""

    @staticmethod
    def _extract_price(selector: Selector) -> float:
        """Extract the course price."""
        price_selectors = [
            '//span[@class="price"]/text()',
            '//*[contains(@class, "price")]/text()',
            '//meta[@property="product:price:amount"]/@content',
        ]
        for xpath in price_selectors:
            price_text = selector.xpath(xpath).get()
            if price_text:
                # Pull out the first number
                numbers = re.findall(r"\d+\.?\d*", price_text)
                if numbers:
                    return float(numbers[0])
        return 0.0

    @staticmethod
    def _extract_rating(selector: Selector) -> float:
        """Extract the course rating."""
        rating_selectors = [
            '//meta[@property="og:rating"]/@content',
            '//span[@class="rating"]/text()',
            '//*[contains(@class, "rating")]/@aria-label',
        ]
        for xpath in rating_selectors:
            rating_text = selector.xpath(xpath).get()
            if rating_text:
                numbers = re.findall(r"\d+\.?\d*", rating_text)
                if numbers:
                    return float(numbers[0])
        return 0.0

    @staticmethod
    def _detect_platform(url: str) -> str:
        """Detect which platform a course URL belongs to."""
        platforms = {
            "coursera.org": "Coursera",
            "udemy.com": "Udemy",
            "edx.org": "edX",
            "khanacademy.org": "Khan Academy",
            "linkedin.com/learning": "LinkedIn Learning",
            "skillshare.com": "Skillshare",
            "pluralsight.com": "Pluralsight",
        }
        for domain, name in platforms.items():
            if domain in url:
                return name
        return "Unknown"

    # The remaining extractors follow the same pattern; they are stubbed
    # here for brevity and return neutral defaults.

    @staticmethod
    def _extract_instructor(selector: Selector) -> str:
        """Extract instructor information."""
        return ""

    @staticmethod
    def _extract_enrollment(selector: Selector) -> int:
        """Extract the enrollment count."""
        return 0

    @staticmethod
    def _extract_duration(selector: Selector) -> str:
        """Extract the course duration."""
        return ""

    @staticmethod
    def _extract_category(selector: Selector) -> str:
        """Extract the course category."""
        return ""

    @staticmethod
    def _extract_description(selector: Selector) -> str:
        """Extract the course description."""
        return ""
```
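To make the JSON-LD path concrete, here is a small self-contained check. The sample document is illustrative; its field names follow schema.org's Course type, which is also what the JSON-LD mapping in Section 6 consumes:

```python
# demo_jsonld.py: what parse_json_ld consumes (illustrative sample)
from parser import CourseParser

html = """
<html><head>
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "Course",
  "name": "Intro to Python",
  "description": "A beginner-friendly course.",
  "aggregateRating": {"ratingValue": 4.7},
  "offers": {"price": 49.99, "priceCurrency": "USD"}
}
</script>
</head><body></body></html>
"""

data = CourseParser.parse_json_ld(html)
print(data["name"], data["offers"]["price"])  # Intro to Python 49.99
```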
Still in parser.py, a Playwright-based renderer handles pages that only produce content after JavaScript runs:

```python
class JavaScriptRenderer:
    """Render JavaScript-heavy pages with Playwright."""

    def __init__(self):
        self.browser = None

    async def __aenter__(self):
        from playwright.async_api import async_playwright
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.browser:
            await self.browser.close()
        if hasattr(self, "playwright"):
            await self.playwright.stop()

    async def render_page(self, url: str) -> str:
        """Render a JavaScript page and return its final HTML."""
        context = await self.browser.new_context(
            user_agent=UserAgentManager.get_random_ua(),
            viewport={"width": 1920, "height": 1080},
        )
        page = await context.new_page()

        # Basic anti-detection: hide the webdriver flag
        await page.add_init_script(
            """Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });"""
        )
        try:
            await page.goto(url, wait_until="networkidle")
            # Give late-loading content a moment to settle
            await page.wait_for_timeout(2000)
            return await page.content()
        finally:
            await page.close()
            await context.close()
```

## 5. The Data Storage Module

```python
# storage.py
import json
from datetime import datetime
from typing import Any, Dict

import pandas as pd
from sqlalchemy import (Column, DateTime, Float, Integer, MetaData, String,
                        Table, Text, create_engine)

from config import CrawlerConfig


class DataStorage:
    """Data storage manager with a write buffer."""

    def __init__(self, config: CrawlerConfig):
        self.config = config
        self.data_buffer = []

    def add_to_buffer(self, course_data: Dict[str, Any]):
        """Add a record to the buffer; flush automatically when it fills up."""
        course_data["crawled_at"] = datetime.now()
        self.data_buffer.append(course_data)
        if len(self.data_buffer) >= 100:
            self.flush_buffer()

    def flush_buffer(self):
        """Flush the buffer to the configured backend."""
        if not self.data_buffer:
            return

        if self.config.OUTPUT_FORMAT == "csv":
            self._save_to_csv()
        elif self.config.OUTPUT_FORMAT == "json":
            self._save_to_json()
        elif self.config.OUTPUT_FORMAT == "database":
            self._save_to_database()

        saved = len(self.data_buffer)
        self.data_buffer.clear()
        print(f"Saved {saved} records")

    def _save_to_csv(self):
        """Append records to a CSV file."""
        df = pd.DataFrame(self.data_buffer)
        # If the file already exists, append to it
        try:
            existing_df = pd.read_csv(self.config.OUTPUT_FILE)
            df = pd.concat([existing_df, df], ignore_index=True)
        except FileNotFoundError:
            pass
        df.to_csv(self.config.OUTPUT_FILE, index=False, encoding="utf-8-sig")

    def _save_to_json(self):
        """Append records to a JSON file."""
        try:
            with open(self.config.OUTPUT_FILE, "r", encoding="utf-8") as f:
                existing_data = json.load(f)
            existing_data.extend(self.data_buffer)
        except FileNotFoundError:
            existing_data = list(self.data_buffer)

        with open(self.config.OUTPUT_FILE, "w", encoding="utf-8") as f:
            # default=str serializes the datetime fields
            json.dump(existing_data, f, ensure_ascii=False, indent=2, default=str)

    def _save_to_database(self):
        """Write records to the database, creating the table if needed."""
        engine = create_engine(self.config.DATABASE_URL)

        metadata = MetaData()
        Table(
            "courses",
            metadata,
            Column("id", Integer, primary_key=True, autoincrement=True),
            Column("title", String(500)),
            Column("instructor", String(200)),
            Column("price", Float),
            Column("rating", Float),
            Column("enrollment_count", Integer),
            Column("duration", String(50)),
            Column("category", String(100)),
            Column("description", Text),
            Column("platform", String(100)),
            Column("source_url", String(500)),
            Column("crawled_at", DateTime),
            Column("created_at", DateTime, default=datetime.now),
        )
        # Create the table if it does not exist
        metadata.create_all(engine)

        df = pd.DataFrame(self.data_buffer)
        df.to_sql("courses", engine, if_exists="append", index=False)

    def close(self):
        """Flush any remaining buffered records."""
        if self.data_buffer:
            self.flush_buffer()
```
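A quick illustration of the buffered-write behaviour, using a hypothetical record and the CSV backend:

```python
# demo_storage.py: illustrative use of DataStorage
from config import CrawlerConfig
from storage import DataStorage

storage = DataStorage(CrawlerConfig(OUTPUT_FORMAT="csv", OUTPUT_FILE="demo.csv"))
storage.add_to_buffer({
    "title": "Intro to Python",  # hypothetical record
    "platform": "Coursera",
    "price": 49.99,
    "rating": 4.7,
})
storage.close()  # flushes the remaining buffer to demo.csv
```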
## 6. The Main Crawler Class

The main class ties the engine, the parsers, and the storage layer together:

```python
# main_crawler.py
import asyncio
import re
from typing import Any, Dict, List
from urllib.parse import urljoin, urlparse

from parsel import Selector

from config import CrawlerConfig
from crawler_engine import AsyncCrawlerEngine
from parser import CourseParser, JavaScriptRenderer
from storage import DataStorage


class OnlineCourseCrawler:
    """Main online-course crawler."""

    def __init__(self, config: CrawlerConfig):
        self.config = config
        self.crawler_engine = AsyncCrawlerEngine(config)
        self.storage = DataStorage(config)
        self.visited_urls = set()
        self.platform_patterns = {
            "coursera": r"https?://www\.coursera\.org/learn/[^/]+",
            "udemy": r"https?://www\.udemy\.com/course/[^/]+",
            "edx": r"https?://www\.edx\.org/learn/[^/]+",
            "skillshare": r"https?://www\.skillshare\.com/[^/]+",
        }

    async def crawl_course_page(self, url: str) -> Dict[str, Any]:
        """Crawl a single course detail page."""
        # Skip URLs we have already visited
        if url in self.visited_urls:
            return {}
        self.visited_urls.add(url)

        # Try a plain HTTP fetch first
        html = await self.crawler_engine.fetch_url(url)

        # An empty or tiny response usually means a JavaScript-rendered page
        if not html or len(html) < 1000:
            try:
                async with JavaScriptRenderer() as renderer:
                    html = await renderer.render_page(url)
            except Exception as e:
                print(f"JavaScript rendering failed {url}: {e}")
                return {}

        if not html:
            return {}

        # Parse course information
        course_data = CourseParser.parse_with_parsel(html, url)

        # Merge in JSON-LD data when available
        json_ld_data = CourseParser.parse_json_ld(html)
        if json_ld_data:
            course_data.update(self._extract_from_json_ld(json_ld_data))

        # Persist the record
        if course_data.get("title"):
            self.storage.add_to_buffer(course_data)
            print(f"Crawled: {course_data['title']}")

        return course_data

    async def crawl_course_list(self, list_url: str, max_pages: int = 10):
        """Crawl a course listing page, following pagination."""
        for page in range(1, max_pages + 1):
            # Pagination schemes differ between platforms
            paginated_url = self._build_pagination_url(list_url, page)
            html = await self.crawler_engine.fetch_url(paginated_url)
            if not html:
                continue

            # Extract course detail links
            course_links = self._extract_course_links(html, list_url)

            # Crawl all detail pages concurrently
            tasks = [self.crawl_course_page(link) for link in course_links]
            await asyncio.gather(*tasks)

            # Stop when there is no next page
            if not self._has_next_page(html):
                break

    def _extract_course_links(self, html: str, base_url: str) -> List[str]:
        """Extract course detail links from a listing page."""
        selector = Selector(text=html)
        link_selectors = [
            '//a[contains(@href, "course")]/@href',
            '//a[contains(@class, "course-link")]/@href',
            "//article//a/@href",
            '//div[@data-testid="course-card"]//a/@href',
        ]
        links = set()
        for xpath in link_selectors:
            for link in selector.xpath(xpath).getall():
                # Normalize to an absolute URL
                absolute_url = urljoin(base_url, link)
                if self._is_course_url(absolute_url):
                    links.add(absolute_url)
        return list(links)[:20]  # cap the number of links per page

    def _is_course_url(self, url: str) -> bool:
        """Check whether a URL looks like a course detail page."""
        return any(
            re.match(pattern, url)
            for pattern in self.platform_patterns.values()
        )

    def _build_pagination_url(self, base_url: str, page: int) -> str:
        """Build a paginated URL for the given platform."""
        parsed = urlparse(base_url)
        if "udemy" in parsed.netloc:
            return f"{base_url}?p={page}"
        # coursera, edx, and the default all use ?page=
        return f"{base_url}?page={page}"

    def _has_next_page(self, html: str) -> bool:
        """Check whether the listing has a next page."""
        selector = Selector(text=html)
        next_buttons = selector.xpath(
            '//a[contains(text(), "Next") or contains(@class, "next")]'
        )
        return len(next_buttons) > 0

    def _extract_from_json_ld(self, json_ld: Dict) -> Dict[str, Any]:
        """Map JSON-LD fields onto our course schema."""
        extracted = {}
        mapping = {
            "name": "title",
            "description": "description",
            "provider": "instructor",
            "aggregateRating.ratingValue": "rating",
            "offers.price": "price",
            "offers.priceCurrency": "currency",
        }
        for json_key, data_key in mapping.items():
            value = self._get_nested_value(json_ld, json_key)
            if value:
                extracted[data_key] = value
        return extracted

    def _get_nested_value(self, data: Dict, key_path: str):
        """Read a nested dict value via a dotted key path."""
        value = data
        for key in key_path.split("."):
            if isinstance(value, dict):
                value = value.get(key)
            else:
                return None
        return value

    async def run(self, start_urls: List[str]):
        """Run the crawler over a list of start URLs."""
        async with self.crawler_engine:
            for url in start_urls:
                if "/course/" in url or "/learn/" in url:
                    # Course detail page
                    await self.crawl_course_page(url)
                else:
                    # Listing page
                    await self.crawl_course_list(url, max_pages=5)

        # Make sure everything is persisted
        self.storage.close()
        print("Crawl finished")
```
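The crawler above deduplicates URLs with an in-memory set, which is fine for small runs but grows without bound. The best-practices section below recommends a Bloom filter for large crawls; here is a minimal stdlib-only sketch (an illustration, not part of the original code; production projects might prefer a maintained package such as pybloom-live):

```python
# bloom.py: minimal Bloom filter for visited-URL checks (illustrative)
import hashlib


class BloomFilter:
    def __init__(self, size_bits: int = 1 << 20, num_hashes: int = 5):
        self.size = size_bits
        self.num_hashes = num_hashes
        self.bits = bytearray(size_bits // 8)

    def _positions(self, item: str):
        # Derive k bit positions from salted SHA-256 digests
        for i in range(self.num_hashes):
            digest = hashlib.sha256(f"{i}:{item}".encode()).digest()
            yield int.from_bytes(digest[:8], "big") % self.size

    def add(self, item: str):
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item: str) -> bool:
        # May return a false positive, never a false negative
        return all(
            self.bits[pos // 8] & (1 << (pos % 8))
            for pos in self._positions(item)
        )


seen = BloomFilter()
seen.add("https://www.udemy.com/course/python-basics/")
print("https://www.udemy.com/course/python-basics/" in seen)  # True
print("https://www.udemy.com/course/other/" in seen)          # False (almost surely)
```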
## 7. Usage Example and Main Program

```python
# main.py
import asyncio

from config import CrawlerConfig
from main_crawler import OnlineCourseCrawler


async def main():
    # Configure the crawler
    config = CrawlerConfig(
        MAX_CONCURRENT_REQUESTS=5,
        REQUEST_TIMEOUT=30,
        OUTPUT_FORMAT="csv",
        OUTPUT_FILE="courses_data.csv",
        MIN_DELAY=1.0,
        MAX_DELAY=3.0,
    )

    # Example start URLs
    start_urls = [
        "https://www.coursera.org/browse",
        "https://www.udemy.com/courses/development/",
        "https://www.edx.org/learn/computer-science",
    ]

    crawler = OnlineCourseCrawler(config)
    try:
        await crawler.run(start_urls)
        # Analysis and reporting
        await generate_report()
    except KeyboardInterrupt:
        print("\nCrawl interrupted by user")
    except Exception as e:
        print(f"Crawler error: {e}")


async def generate_report():
    """Generate a summary report of the crawled data."""
    import matplotlib.pyplot as plt
    import pandas as pd

    try:
        df = pd.read_csv("courses_data.csv")

        print("=" * 50)
        print("Crawl statistics")
        print("=" * 50)

        # Basic stats
        print(f"Total courses: {len(df)}")
        print("Platform distribution:")
        print(df["platform"].value_counts())

        # Price analysis
        print("\nPrice statistics:")
        print(f"Average price: ${df['price'].mean():.2f}")
        print(f"Highest price: ${df['price'].max():.2f}")
        print(f"Lowest price: ${df['price'].min():.2f}")

        # Rating analysis
        print("\nRating statistics:")
        print(f"Average rating: {df['rating'].mean():.2f}/5")

        # Charts
        plt.figure(figsize=(12, 6))

        # Platform distribution pie chart
        plt.subplot(1, 2, 1)
        df["platform"].value_counts().plot.pie(autopct="%1.1f%%")
        plt.title("Courses by platform")

        # Price distribution histogram
        plt.subplot(1, 2, 2)
        df["price"].hist(bins=20, edgecolor="black")
        plt.title("Price distribution")
        plt.xlabel("Price ($)")
        plt.ylabel("Number of courses")

        plt.tight_layout()
        plt.savefig("course_analysis.png", dpi=300)
        plt.close()
        print("\nReport saved: course_analysis.png")
    except FileNotFoundError:
        print("Data file not found; run the crawler first")
    except Exception as e:
        print(f"Report generation failed: {e}")


if __name__ == "__main__":
    # Event-loop policy needed on Windows
    try:
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    except AttributeError:
        pass

    asyncio.run(main())
```

## 8. Going Further: An API Service and Distributed Scaling

```python
# api_service.py
from typing import List

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI(title="Online Course Crawler API")


class CrawlRequest(BaseModel):
    urls: List[str]
    max_pages: int = 5
    output_format: str = "json"


class CrawlResponse(BaseModel):
    job_id: str
    status: str
    message: str


class CourseData(BaseModel):
    title: str
    platform: str
    price: float
    rating: float
    url: str

# API endpoints for starting and monitoring crawl jobs would go here;
# the full implementation is omitted for brevity.
```

Hints for scaling out (a minimal Redis sketch appears at the end of this article):

1. Use Redis as the task queue
2. Distribute tasks with Celery or RQ
3. Containerize the deployment with Docker
4. Manage the cluster with Kubernetes

## Crawler Best Practices and Caveats

### 1. Respect robots.txt

```python
# robots_checker.py
import urllib.robotparser
from urllib.parse import urlparse


def check_robots_permission(url: str, user_agent: str = "*") -> bool:
    """Check whether robots.txt allows crawling the URL."""
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch(user_agent, url)
```

### 2. Ethics and legal considerations

- Only crawl publicly accessible data
- Respect each site's terms of service
- Avoid putting excessive load on servers
- Comply with data-protection regulations such as the GDPR

### 3. Performance tips

- Reuse HTTP connections through a connection pool
- Crawl incrementally to avoid re-fetching unchanged pages
- Check visited URLs with a Bloom filter (see the sketch at the end of Section 6)
- Support resuming an interrupted crawl from a checkpoint

## Summary

This article walked through building a full-featured online-course crawler with modern Python, covering the complete pipeline from plain HTTP requests to JavaScript rendering, and from parsing to storage and analysis. The key techniques:

- Asynchronous concurrency for a substantial throughput gain
- Layered parsing strategies (XPath selectors plus JSON-LD) to raise the success rate
- Measured anti-bot tactics: randomized delays, proxies, and browser automation
- Structured storage with multiple output formats
- Logging and error handling throughout the pipeline
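Appendix: to make the Redis task-queue hint from Section 8 concrete, here is a minimal producer/worker sketch. It is illustrative only and assumes a local Redis instance, the redis-py package, and a hypothetical queue name course:urls:

```python
# distributed_sketch.py: Redis-backed URL queue (illustrative)
import asyncio

import redis

from config import CrawlerConfig
from main_crawler import OnlineCourseCrawler

QUEUE = "course:urls"  # hypothetical queue name
r = redis.Redis(host="localhost", port=6379, decode_responses=True)


def enqueue(urls):
    """Producer: push seed URLs onto the shared queue."""
    r.lpush(QUEUE, *urls)


async def worker():
    """Worker: pop URLs and crawl them until the queue stays empty."""
    crawler = OnlineCourseCrawler(CrawlerConfig())
    async with crawler.crawler_engine:
        while True:
            item = r.brpop(QUEUE, timeout=5)  # blocks for up to 5 s
            if item is None:
                break  # queue drained
            _, url = item
            await crawler.crawl_course_page(url)
    crawler.storage.close()


if __name__ == "__main__":
    enqueue(["https://www.udemy.com/course/python-basics/"])
    asyncio.run(worker())
```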