EADST

Python: Obtain Baidu Images Using Web Crawler

Python: Obtain Baidu Images Using Web Crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect():
    """Download Baidu image-search results for keywords listed in a text file.

    The keyword file holds the number of images wanted per keyword on its
    first non-blank line, followed by one keyword per line. Calling the
    instance creates one timestamped folder per keyword in the current
    working directory and saves the downloaded images there.
    """

    # Matches every "objURL":"<url>", entry embedded in the result page.
    _OBJ_URL_PATTERN = re.compile(r'"objURL":"(.*?)",', re.S)
    # Responses of this size or smaller are skipped instead of being saved.
    _MIN_IMAGE_BYTES = 10000
    # Increment applied to the ``pn`` query value between result pages.
    _PAGE_STEP = 60
    _SEARCH_URL = 'https://image.baidu.com/search/flip'

    def __init__(self, path='./name.txt'):
        """Read the per-keyword image count and the keywords from *path*.

        Blank lines are ignored so that a trailing newline does not turn
        into an empty search keyword.

        Args:
            path: UTF-8 text file; first non-blank line is the image count,
                every following non-blank line is a keyword.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if the file has no non-blank line, or the first
                non-blank line is not an integer.
        """
        self.num = 1  # 1-based index of the next image to save
        with open(path, encoding='utf-8') as file:
            stripped = [line.strip() for line in file]
        lines = [line for line in stripped if line]
        if not lines:
            raise ValueError(
                f'{path} is empty; expected the image count on the first line'
            )
        self.class_number = int(lines[0])
        self.line_list = lines[1:]

    def dowmload_picture(self, html, keyword, save_path):
        """Save the images referenced by *html* into *save_path*.

        Images are written as ``<keyword>_<index>.jpg``, where the index is
        ``self.num``; it is advanced after each successful save. The method
        stops as soon as ``self.num`` exceeds ``self.class_number``.

        Args:
            html: text of a search result page containing ``"objURL"`` entries.
            keyword: search keyword, used in log output and file names.
            save_path: existing directory that receives the image files.
        """
        pic_urls = self._OBJ_URL_PATTERN.findall(html)  # get image urls
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_urls:
            # Check the quota before downloading so it is never exceeded.
            if self.num > self.class_number:
                break
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                # Do not store an HTTP error page as if it were an image.
                pic.raise_for_status()
                content = pic.content
                if len(content) > self._MIN_IMAGE_BYTES:  # img size > 10k
                    # os.path.join keeps the path valid on every platform.
                    target = os.path.join(save_path, f'{keyword}_{self.num}.jpg')
                    with open(target, 'wb') as fp:
                        fp.write(content)
                    self.num += 1
            except Exception as err:
                # Best effort: one bad URL must not abort the whole run, but
                # Ctrl-C (KeyboardInterrupt) is no longer swallowed.
                print(f'error, cannot download: {err}')

    # Correctly spelled alias; the original misspelled name is kept for callers.
    download_picture = dowmload_picture

    def __call__(self):
        """Search every keyword and download up to ``class_number`` images each."""
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        # The context manager closes the session's connections when done.
        with requests.Session() as session:
            session.headers.update(headers)

            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                os.makedirs(save_path, exist_ok=True)
                # get images
                self.num = 1
                image_number = 0
                while image_number < self.class_number:
                    # Passing params lets requests URL-encode the keyword.
                    query = {
                        'tn': 'baiduimage',
                        'ie': 'utf-8',
                        'word': word,
                        'pn': image_number,
                    }
                    try:
                        result = session.get(
                            self._SEARCH_URL,
                            params=query,
                            timeout=10,
                            allow_redirects=False,
                        )
                        self.dowmload_picture(result.text, word, save_path)
                    except Exception as err:
                        # Best effort: skip this page and try the next one.
                        print(f'Internet error: {err}')
                    image_number += self._PAGE_STEP

# Script entry point: read ./keywords.txt and download images for every keyword.
if __name__ == '__main__':
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. The following lines are the keywords, one per line.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
VPN SVR 关于博主 Dataset 多线程 DeepStream LoRA CUDA mmap XML Bin Qwen2 云服务器 Miniforge CEIR Pickle Pandas LeetCode scipy InvalidArgumentError Numpy transformers FP64 Template ONNX Safetensors v2ray ModelScope Password YOLO Hungarian Firewall CC LaTeX Pillow Hilton Quantize icon Augmentation Sklearn Michelin uWSGI Algorithm 报税 GPT4 RAR LLAMA Transformers Data Proxy diffusers logger Cloudreve Shortcut CLAP 版权 Domain Heatmap BTC GGML Breakpoint Website 腾讯云 Card CSV Jupyter llama.cpp PDB GoogLeNet Translation Color Distillation WebCrawler printf tqdm Windows Baidu Plate Qwen2.5 AI 图标 Datetime Search WAN ChatGPT SQLite VGG-16 hf Gemma Interview git 强化学习 Plotly PIP Animate Excel Food 证件照 Rebuttal Review Image2Text CTC tar OpenAI SAM 多进程 Nginx JSON 签证 COCO Streamlit 搞笑 算法题 Git Pytorch 继承 FastAPI Clash TensorRT SQL Qwen BeautifulSoup FP16 Conda OpenCV Docker PyCharm Quantization Use GIT XGBoost Bipartite EXCEL VSCode Linux TTS HuggingFace CV Diagram LLM Python Tensor 飞书 Jetson Vmess C++ OCR Github GPTQ Attention Freesound Math 递归学习法 TensorFlow MD5 Hotel Base64 SPIE QWEN Tiktoken Knowledge NLTK IndexTTS2 Statistics Anaconda Logo Permission Paddle v0.dev UI FP32 Video Mixtral Django Crawler Land Disk 公式 Bitcoin NLP torchinfo Random Tracking Google 顶会 Ptyhon Zip TSV git-lfs Markdown HaggingFace Bert 域名 uwsgi Magnet FlashAttention Input RGB UNIX 财报 第一性原理 Agent Claude News BF16 净利润 DeepSeek FP8 图形思考法 PDF CAM ResNet-50 阿里云 音频 Ubuntu Vim Web Paper NameSilo PyTorch Llama API
站点统计

本站现有博文323篇,共被浏览795460

本站已经建立2493天!

热门文章
文章归档
回到顶部