python出现RuntimeError错误问题及解决

时间：2022-08-08 08:20:35|栏目：Python代码|点击：次

下面是出现的错误解释

RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.

This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:

if __name__ == '__main__':
freeze_support()
...

The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.

下面是出现错误代码的原代码

import multiprocessing as mp
import time
from urllib.request import urlopen,urljoin
from bs4 import BeautifulSoup
import re
 
base_url = "https://morvanzhou.github.io/"
 
#crawl爬取网页
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)
    return response.read().decode()
 
#parse解析网页
def parse(html):
    soup = BeautifulSoup(html,'html.parser')
    urls = soup.find_all('a',{"href":re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url,url['href'])for url in urls])
    url = soup.find('meta',{'property':"og:url"})['content']
    return title,page_urls,url
 
unseen = set([base_url])
seen = set()
restricted_crawl = True
 
pool = mp.Pool(4)
count, t1 = 1, time.time()
while len(unseen) != 0:                 # still get some url to visit
    if restricted_crawl and len(seen) > 20:
            break
    print('\nDistributed Crawling...')
    crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
    htmls = [j.get() for j in crawl_jobs]      # request connection
 
    print('\nDistributed Parsing...')
    parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
    results = [j.get() for j in parse_jobs]    # parse html
 
    print('\nAnalysing...')
    seen.update(unseen)         # seen the crawled
    unseen.clear()              # nothing unseen
 
    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)     # get new url to crawl
print('Total time: %.1f s' % (time.time()-t1))    # 16 s !!!

这是修改后的正确代码

import multiprocessing as mp
import time
from urllib.request import urlopen,urljoin
from bs4 import BeautifulSoup
import re
 
base_url = "https://morvanzhou.github.io/"
 
#crawl爬取网页
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)
    return response.read().decode()
 
#parse解析网页
def parse(html):
    soup = BeautifulSoup(html,'html.parser')
    urls = soup.find_all('a',{"href":re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url,url['href'])for url in urls])
    url = soup.find('meta',{'property':"og:url"})['content']
    return title,page_urls,url
 
def main():
    unseen = set([base_url])
    seen = set()
    restricted_crawl = True
 
    pool = mp.Pool(4)
    count, t1 = 1, time.time()
    while len(unseen) != 0:                 # still get some url to visit
        if restricted_crawl and len(seen) > 20:
                break
        print('\nDistributed Crawling...')
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]      # request connection
 
        print('\nDistributed Parsing...')
        parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
        results = [j.get() for j in parse_jobs]    # parse html
 
        print('\nAnalysing...')
        seen.update(unseen)         # seen the crawled
        unseen.clear()              # nothing unseen
 
        for title, page_urls, url in results:
            print(count, title, url)
            count += 1
            unseen.update(page_urls - seen)     # get new url to crawl
    print('Total time: %.1f s' % (time.time()-t1))    # 16 s !!!
 
 
if __name__ == '__main__':
    main()

综上可知，就是把你的运行代码整合成一个函数，然后加入