EADST

Python: Obtain Baidu Images Using Web Crawler

This post shows how to download Baidu image-search results with a Python web crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect():
    """Download Baidu image-search results for a list of keywords.

    The keyword file is UTF-8 text. Its first line is the number of images
    to collect per keyword; every following non-blank line is one keyword.
    Calling the instance creates one timestamped folder per keyword in the
    current working directory and saves the downloaded images there.
    """

    # Captures the value of every "objURL" field found in a result page.
    _OBJ_URL_PATTERN = re.compile(r'"objURL":"(.*?)",', re.S)

    # Responses at or below this many bytes are not saved (img size > 10k).
    _MIN_IMAGE_BYTES = 10000

    # Amount the ``pn`` search offset is advanced between page requests.
    _PAGE_STEP = 60

    _SEARCH_URL = 'https://image.baidu.com/search/flip'

    def __init__(self, path='./name.txt'):
        """Load the per-keyword image count and the keywords from *path*.

        Args:
            path: UTF-8 text file; first line is an integer count, the
                remaining lines are keywords.

        Raises:
            OSError: if *path* cannot be opened.
            IndexError: if the file is empty.
            ValueError: if the first line is not an integer.
        """
        # 1-based index of the next image to save for the current keyword.
        self.num = 1
        with open(path, encoding='utf-8') as file:
            lines = [line.strip() for line in file]
        # Number of images wanted for each keyword.
        self.class_number = int(lines[0])
        # Blank lines are dropped so they are not searched as empty keywords.
        self.line_list = [line for line in lines[1:] if line]

    def dowmload_picture(self, html, keyword, save_path):
        """Download the images referenced by one search-result page.

        Files are written as ``<keyword>_<n>.jpg`` inside *save_path*, and
        ``self.num`` is advanced once per saved file. The loop stops as soon
        as ``self.num`` exceeds ``self.class_number``. Failures on a single
        image are reported and skipped.

        Args:
            html: Text of the result page to scan for ``objURL`` entries.
            keyword: Search keyword, used as the file-name prefix.
            save_path: Directory that receives the image files.
        """
        pic_url = self._OBJ_URL_PATTERN.findall(html)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            # Check the quota *before* fetching so a later page can never
            # push the total past class_number.
            if self.num > self.class_number:
                break
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                if len(pic.content) > self._MIN_IMAGE_BYTES:
                    # os.path.join picks the right separator on every OS.
                    target = os.path.join(save_path, keyword + '_' + str(self.num) + '.jpg')
                    with open(target, 'wb') as fp:
                        fp.write(pic.content)
                    self.num += 1
            except Exception:
                # Best effort: one bad image must not abort the batch, but
                # KeyboardInterrupt/SystemExit are allowed to propagate.
                print('error, cannot download')

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    download_picture = dowmload_picture

    def __call__(self):
        """Search every keyword and save its images into a new folder.

        For each keyword a folder named ``<keyword>_file_<YYYYmmdd_HHMMSS>``
        is created, then result pages are requested in steps of
        ``_PAGE_STEP`` until the offset reaches ``self.class_number`` or
        enough images have been saved.
        """
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        # The context manager releases the session's connections when done.
        with requests.Session() as session:
            session.headers.update(headers)

            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                os.makedirs(save_path, exist_ok=True)
                # get images
                image_number = 0
                self.num = 1
                while image_number < self.class_number and self.num <= self.class_number:
                    try:
                        # Passing the query as params lets requests escape
                        # the keyword (spaces, '&', '#', non-ASCII text).
                        result = session.get(
                            self._SEARCH_URL,
                            params={
                                'tn': 'baiduimage',
                                'ie': 'utf-8',
                                'word': word,
                                'pn': image_number,
                            },
                            timeout=10,
                            allow_redirects=False,
                        )
                        self.dowmload_picture(result.text, word, save_path)
                    except Exception:
                        print('Internet error')
                    image_number += self._PAGE_STEP

if __name__ == '__main__':
    # Keyword file: first line is the image count per keyword,
    # the following lines are the keywords to search.
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each of the following lines is one keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
Video JSON 版权 LeetCode Plotly Bitcoin Vmess 财报 Bipartite 证件照 Statistics v0.dev 阿里云 公式 Qwen uWSGI RAR logger CUDA LaTeX Website DeepStream Github PDF TensorRT Attention 飞书 HaggingFace VSCode Hungarian Nginx UI LLM hf Baidu Template v2ray Agent Bert torchinfo Ubuntu News Google 多进程 FastAPI SPIE FlashAttention Distillation Jetson mmap SAM API Linux PyCharm Math Docker GGML FP8 递归学习法 IndexTTS2 Breakpoint CSV 论文 Safetensors Anaconda Paper Domain C++ VPN 搞笑 腾讯云 Quantize Tiktoken GPT4 printf git 报税 Heatmap Shortcut Cloudreve CC Rebuttal 净利润 Ptyhon Dataset CV Card Proxy 顶会 云服务器 CEIR DeepSeek Pytorch BTC transformers 算法题 Logo AI Windows Mixtral HuggingFace ResNet-50 Use WebCrawler icon Markdown OCR XML LLAMA 图形思考法 Crawler FP64 tar Diagram Plate Web 图标 Permission QWEN ChatGPT Freesound Clash llama.cpp Tensor CLAP OpenAI Disk FP16 Algorithm tqdm NLP Pickle BeautifulSoup 多线程 Knowledge 强化学习 Interview Quantization Vim Python 第一性原理 域名 Paddle PDB Qwen2 RGB YOLO CAM Streamlit Tracking Image2Text BF16 Qwen2.5 Bin scipy git-lfs Claude GPTQ XGBoost TensorFlow Hilton MD5 Transformers Review Color GoogLeNet PIP TTS GIT UNIX Input Magnet Base64 Numpy VGG-16 OpenCV 签证 ModelScope 继承 Gemma EXCEL LoRA Llama Translation Pillow Miniforge Jupyter Conda 论文速读 Password Animate 关于博主 Land TSV Sklearn PyTorch Excel Augmentation Hotel COCO Search SQLite Git SVR WAN Zip ONNX FP32 Datetime Firewall NameSilo Food uwsgi Michelin Random Pandas diffusers NLTK InvalidArgumentError Django SQL CTC Data 音频
站点统计

本站现有博文327篇,共被浏览833644

本站已经建立2538天!

热门文章
文章归档
回到顶部