{"id":1960,"date":"2023-11-29T14:35:30","date_gmt":"2023-11-29T06:35:30","guid":{"rendered":"http:\/\/zhang.mba\/?p=1960"},"modified":"2023-11-29T14:41:24","modified_gmt":"2023-11-29T06:41:24","slug":"flask-ji-xian-tiao-zhankai-fa-chu-quan-wen-sou-suo","status":"publish","type":"post","link":"https:\/\/zhang.mba\/index.php\/2023\/11\/29\/14\/35\/30\/1960\/flask-ji-xian-tiao-zhankai-fa-chu-quan-wen-sou-suo\/python\/zhangzhiqi\/","title":{"rendered":"Flask\u6781\u9650\u6311\u6218\u2014\u2014\u5f00\u53d1\u51fa\u5168\u6587\u641c\u7d22\u5f15\u64ce"},"content":{"rendered":"<p>\u00a0 \u00a0\u4eca\u5929\u5b66\u4e60\u8fc7\u7a0b\u4e2d\u5199\u4e86\u4e00\u4e2aflask\u641c\u7d22\u5f15\u64ce\u5305\u62ec\u722c\u866b\u548c\u641c\u7d22\u9875\u9762\u3002\u529f\u80fd\u6709\u5168\u6587\u641c\u7d22\uff0c\u5206\u9875\uff0c\u722c\u866b\u7b49\u7b49\u3002<\/p>\n<p>\u53ea\u9700\u5b89\u88c5flask\u548cjieba\u5373\u53ef:<\/p>\n<pre class=\"line-numbers\"><code class=\"language-python\">pip install flask jieba\n<\/code><\/pre>\n<p><div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' href='https:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/sHJnM03rkibpib5FmE1MqdU0EFawDJmRltksicHX4O0p46ZUI0kY7icnsDLAR8iaJsMK8pgVicecGCdaqXbReyz4r06w\/640?wx_fmt=jpeg&amp;wxfrom=5&amp;wx_lazy=1&amp;wx_co=1'><img class=\"lazyload lazyload-style-2\" src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  data-original=\"https:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/sHJnM03rkibpib5FmE1MqdU0EFawDJmRltksicHX4O0p46ZUI0kY7icnsDLAR8iaJsMK8pgVicecGCdaqXbReyz4r06w\/640?wx_fmt=jpeg&amp;wxfrom=5&amp;wx_lazy=1&amp;wx_co=1\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" alt=\"\u56fe\u7247\" \/><\/div><\/p>\n<p><div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' href='https:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/sHJnM03rkibpib5FmE1MqdU0EFawDJmRlthFkdouzmvDUaDVajLy3vVlqbwNkoQNNJXteRqGkvOhydfz516MW5uw\/640?wx_fmt=jpeg&amp;wxfrom=5&amp;wx_lazy=1&amp;wx_co=1'><img class=\"lazyload lazyload-style-2\" src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  data-original=\"https:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/sHJnM03rkibpib5FmE1MqdU0EFawDJmRlthFkdouzmvDUaDVajLy3vVlqbwNkoQNNJXteRqGkvOhydfz516MW5uw\/640?wx_fmt=jpeg&amp;wxfrom=5&amp;wx_lazy=1&amp;wx_co=1\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" alt=\"\u56fe\u7247\" \/><\/div><\/p>\n<p>\u641c\u7d22\u5f15\u64ce\u540e\u7aef\uff1a<\/p>\n<pre class=\"line-numbers\"><code class=\"language-python\">from flask import Flask, render_template, request, session, jsonify\nimport sqlite3\nimport jieba\nimport math\nimport string\nimport re\n\napp = Flask(__name__)\nDATABASE = 'data.db'\n\n\ndef create_database():\n    conn = sqlite3.connect(DATABASE)\n    c = conn.cursor()\n    c.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS contents USING fts5(\n        title, url , favicon , description , content , keywords , date,img )''')\n    conn.commit()\n    conn.close()\n\n\ndef tokenize(title):\n    words = [word for word in jieba.cut(title) if word not in string.punctuation]  # \u5206\u8bcd\u5e76\u53bb\u6389\u6807\u70b9\u7b26\u53f7\n    keywords = [word for word in words if len(word) &gt; 1]  # \u53bb\u6389\u5355\u4e2a\u5b57\n    keywords = list(set(keywords))  # \u53bb\u91cd\n    keywords.sort(key=words.index)  # \u6309\u5728title\u4e2d\u51fa\u73b0\u7684\u987a\u5e8f\u6392\u5e8f\n    keyword_str = ' '.join(keywords)  # \u5c06\u5173\u952e\u8bcd\u5217\u8868\u8f6c\u6362\u4e3a\u4ee5\u7a7a\u683c\u5206\u9694\u7684\u5b57\u7b26\u4e32\n    keyword_str = ''.join(filter(lambda x: x not in string.punctuation, keyword_str))  # \u53bb\u6389\u5b57\u7b26\u4e32\u4e2d\u7684\u6807\u70b9\u7b26\u53f7\n    return keyword_str\n\n\ndef search_contents(query, offset, per_page):\n    conn = sqlite3.connect(DATABASE)\n    conn.row_factory = sqlite3.Row\n    c = conn.cursor()\n    c.execute(&quot;SELECT COUNT(*) FROM contents WHERE keywords MATCH :query&quot;,\n              {'query': query})\n    total_results = c.fetchone()[0]  # \u83b7\u53d6\u641c\u7d22\u7ed3\u679c\u603b\u6570\n    total_pages = calculate_total_pages(total_results, per_page)  # \u8ba1\u7b97\u603b\u9875\u6570\n    if offset &gt;= total_results:\n        offset = (total_pages - 1) * per_page\n    c.execute(&quot;SELECT title, url, favicon, description, keywords, date FROM contents WHERE keywords MATCH :query LIMIT :per_page OFFSET :offset&quot;,\n              {'query': query, 'per_page': per_page, 'offset': offset})\n    rows = c.fetchall()\n    conn.close()\n    return {'results': [dict(row) for row in rows],\n            'total_results': total_results,\n            'total_pages': total_pages}\n\n\ndef calculate_total_pages(total_results, per_page):\n    return math.ceil(total_results \/ per_page)\n\n\n@app.before_request\ndef session_online():\n    session_id = request.cookies.get('session_id')\n    online = session.get('Online', 0)\n    if session_id is not None:\n        online += 1\n    session['Online'] = online\n\n\n@app.route('\/get_suggestions')\ndef get_suggestions():\n    query = request.args.get('q')\n\n    conn = sqlite3.connect(DATABASE)\n    c = conn.cursor()\n\n    # \u5728contents\u8868\u4e2d\u67e5\u8be2\u5305\u542b\u8f93\u5165\u5173\u952e\u8bcd\u7684title\u5217\uff0c\u6700\u591a\u8fd4\u56de5\u4e2a\u7ed3\u679c\n    c.execute(&quot;SELECT title FROM contents WHERE title LIKE ? LIMIT 5&quot;, ('%' + query + '%',))\n    suggestions = [row[0] for row in c.fetchall()]\n\n    conn.close()\n\n    return jsonify(suggestions=suggestions)\n\n\n@app.route('\/', methods=['GET'])\ndef index():\n    # \u5904\u7406\u641c\u7d22\u8bf7\u6c42\n    query = request.args.get('q', '')  # \u83b7\u53d6\u67e5\u8be2\u5173\u952e\u8bcd\uff0c\u9ed8\u8ba4\u4e3a\u7a7a\u5b57\u7b26\u4e32\n    page = request.args.get('page', '1')  # \u83b7\u53d6\u5f53\u524d\u9875\u6570\uff0c\u9ed8\u8ba4\u4e3a\u7b2c1\u9875\n    per_page = 10  # \u6bcf\u9875\u663e\u793a\u7684\u7ed3\u679c\u6570\u91cf\n    offset = (int(page) - 1) * per_page  # \u8ba1\u7b97\u504f\u79fb\u91cf\n    online = session.get('Online', 0)\n    if query:\n        # \u641c\u7d22\u7f51\u9875\u5185\u5bb9\n        content_result = search_contents(tokenize(query), offset, per_page)\n\n        return render_template('index.html',\n                               query=query,\n                               content_result=content_result['results'],\n                               total_results=content_result['total_results'],  # \u663e\u793a\u641c\u7d22\u7ed3\u679c\u603b\u6570\n                               total_pages=content_result['total_pages'],\n                               current_page=int(page),\n                               online=online)\n    else:\n        return render_template('index.html',\n                               online=online)\n\n\nif __name__ == '__main__':\n    create_database()\n    app.secret_key = 'pyxueba'\n    app.run(debug=True)\n<\/code><\/pre>\n<p>\u641c\u7d22\u5f15\u64ce\u524d\u7aef\uff1a<\/p>\n<pre class=\"line-numbers\"><code class=\"language-python\">&lt;!DOCTYPE html&gt;\n&lt;html&gt;\n&lt;head&gt;\n    &lt;meta charset=&quot;UTF-8&quot;&gt;\n    &lt;title&gt;Python\u5b66\u9738\u641c\u7d22\u5f15\u64ce&lt;\/title&gt;\n    &lt;link rel=&quot;icon&quot; type=&quot;image\/svg+xml&quot; href=&quot;favicon.svg&quot;&gt;\n    &lt;script src=&quot;https:\/\/ajax.googleapis.com\/ajax\/libs\/jquery\/3.6.0\/jquery.min.js&quot;&gt;&lt;\/script&gt;\n    &lt;style&gt;\n        body {\n            font-family: Arial, sans-serif;\n            margin: 50px;\n        }\n\n        h1 {\n            font-size: 24px;\n            margin-bottom: 20px;\n            text-align: center;\n        }\n\n        .search-box {\n            margin-bottom: 20px;\n            text-align: center;\n        }\n\n        .search-box input[type=&quot;text&quot;] {\n            padding: 6px 2px;\n            font-size: 16px;\n            border-radius: 4px;\n            border: 1px solid #999;\n            width: 40%;\n            max-width: 100%;\n        }\n\n        .search-box button[type=&quot;submit&quot;] {\n            padding: 6px 12px;\n            font-size: 16px;\n            border-radius: 4px;\n            background-color: #006621;\n            color: #fff;\n            border: none;\n            cursor: pointer;\n        }\n\n        .search-box button[type=&quot;submit&quot;]:hover {\n            background-color: #00511a;\n        }\n\n        .result-item {\n            margin-bottom: 20px;\n            border: 1px solid #ddd;\n            border-radius: 4px;\n            padding: 10px;\n        }\n\n        a {\n            text-decoration: none;\n        }\n\n        .result-title {\n            font-size: 20px;\n            font-weight: bold;\n            text-align: left; \/* \u4fee\u6539\u6b64\u884c *\/\n        }\n\n        .result-title a {\n            color: #008000;\n        }\n\n        .result-url {\n            color: #000000;\n            font-size: 14px;\n            margin-bottom: 5px;\n        }\n\n        .result-time {\n            font-size: 14px;\n            color: #999;\n        }\n\n        .result-description {\n            margin-top: 10px;\n        }\n\n        .pagination {\n            margin-top: 20px;\n            text-align: center;\n        }\n\n        .pagination-link {\n            display: inline-block;\n            padding: 6px 12px;\n            margin-right: 5px;\n            color: #333;\n            border-radius: 4px;\n            background-color: #f5f5f5;\n            text-decoration: none;\n        }\n\n        .pagination-link:hover {\n            background-color: #ddd;\n        }\n\n        .highlight {\n            background-color: #FFD700;\n        }\n\n        .footer {\n            margin-top: 50px;\n            text-align: center;\n            color: #999;\n            font-size: 12px;\n        }\n\n        .visitor-count {\n            margin-top: 10px;\n        }\n\n        .visitor-count span {\n            margin-left: 5px;\n        }\n\n.favicon {\n  width: 16px;\n  height: 16px;\n  margin-right:3px;\n}\n&lt;\/style&gt;\n&lt;\/head&gt;\n\n&lt;body&gt;\n    &lt;h1&gt;python\u5b66\u9738\u5168\u6587\u641c\u7d22&lt;\/h1&gt;\n\n    &lt;div class=&quot;search-box&quot;&gt;\n        &lt;form action=&quot;\/&quot; method=&quot;get&quot;&gt;\n            &lt;input type=&quot;text&quot; name=&quot;q&quot; id=&quot;search-input&quot; list=&quot;suggestion-list&quot; placeholder=&quot;\u4f60\u8d1f\u8d23\u641c\uff0c\u6211\u8d1f\u8d23\u627e\u00b7\u00b7\u00b7&quot;&gt;\n            &lt;datalist id=&quot;suggestion-list--------&quot; class=&quot;suggestion-list------&quot;&gt;&lt;\/datalist&gt;\n            &lt;button type=&quot;submit&quot;&gt;\u641c\u7d22&lt;\/button&gt;\n        &lt;\/form&gt;\n    &lt;\/div&gt;\n\n    {% if content_result %}\n    &lt;p&gt;\u5171\u627e\u5230 {{ total_results }} \u6761\u7ed3\u679c\u3002&lt;\/p&gt;\n    {% for result in content_result %}\n    &lt;div class=&quot;search-summary&quot;&gt;\n    &lt;\/div&gt;\n    &lt;div class=&quot;result-item&quot;&gt;\n        &lt;h2 class=&quot;result-title&quot;&gt;&lt;img src=&quot;{{ result.favicon }}&quot; alt=&quot;Favicon&quot; class=&quot;favicon&quot;\n                style=&quot;border: 1px solid #ccc; border-radius: 5px;&quot; \/&gt;&lt;a class=&quot;result-link&quot; href=&quot;{{ result.url }}&quot;\n                target=&quot;_blank&quot;&gt;{{ result.title }}&lt;\/a&gt;&lt;\/h2&gt;\n        &lt;p class=&quot;result-url&quot;&gt;&lt;span class=&quot;time&quot;&gt;{{ result.date }}&lt;\/span&gt; {{ result.description }}&lt;\/p&gt;\n    &lt;\/div&gt;\n    {% endfor %}\n\n    &lt;div class=&quot;pagination&quot;&gt;\n        {% if total_pages &gt; 1 %}\n        {% for page in range(1, total_pages + 1) %}\n        {% if page == current_page %}\n        &lt;a class=&quot;pagination-link highlight&quot; href=&quot;\/?q={{ query }}&amp;amp;page={{ page }}&quot;&gt;{{ page }}&lt;\/a&gt;\n        {% else %}\n        &lt;a class=&quot;pagination-link&quot; href=&quot;\/?q={{ query }}&amp;amp;page={{ page }}&quot;&gt;{{ page }}&lt;\/a&gt;\n        {% endif %}\n        {% endfor %}\n        {% endif %}\n    &lt;\/div&gt;\n    {% endif %}\n\n    &lt;div class=&quot;footer&quot;&gt;\n       @2023 Python\u5b66\u9738.\n        &lt;div class=&quot;visitor-count&quot;&gt;\n            &lt;p&gt;\u603b\u8bbf\u95ee: {{ online }}&lt;\/p&gt;\n        &lt;\/div&gt;\n    &lt;\/div&gt;\n\n    &lt;script&gt;\n        \/\/ JavaScript \u53ef\u9009\uff0c\u7528\u4e8e\u7ed9\u641c\u7d22\u5173\u952e\u8bcd\u6dfb\u52a0\u9ad8\u4eae\u6837\u5f0f\n        window.onload = function () {\n            var query = &quot;{{ query }}&quot;;\n            var titles = document.getElementsByClassName(&quot;result-title&quot;);\n            for (var i = 0; i &lt; titles.length; i++) {\n                var title = titles[i];\n                var highlighted = title.innerHTML.replace(new RegExp(query, &quot;gi&quot;), '&lt;span class=&quot;highlight&quot;&gt;$&amp;&lt;\/span&gt;');\n                title.innerHTML = highlighted;\n            }\n        };\n&lt;\/script&gt;\n    &lt;script type=&quot;text\/javascript&quot;&gt;\n        $(document).ready(function () {\n            $('#search-input').on('input', function () {\n                var query = $(this).val();\n                if (query.trim().length &gt; 0) { \/\/ \u786e\u4fdd\u8f93\u5165\u4e0d\u662f\u7a7a\u767d\u5b57\u7b26\n                    $.ajax({\n                        url: '\/get_suggestions',\n                        data: { q: query },\n                        success: function (response) {\n                            var suggestions = response.suggestions;\n                            var suggestionList = $('#suggestion-list');\n                            suggestionList.empty(); \/\/ \u6e05\u7a7a\u4e4b\u524d\u7684\u5efa\u8bae\u5217\u8868\n                            for (var i = 0; i &lt; suggestions.length; i++) {\n                                var suggestionItem = $('&lt;li&gt;').text(suggestions[i]);\n                                suggestionList.append(suggestionItem);\n                            }\n                            suggestionList.show(); \/\/ \u663e\u793a\u5efa\u8bae\u5217\u8868\n                        }\n                    });\n                } else {\n                    $('#suggestion-list').empty().hide(); \/\/ \u8f93\u5165\u4e3a\u7a7a\u65f6\u9690\u85cf\u5efa\u8bae\u5217\u8868\n                }\n            });\n\n            \/\/ \u5f53\u7528\u6237\u70b9\u51fb\u5efa\u8bae\u9879\u65f6\u5c06\u5176\u586b\u5145\u5230\u641c\u7d22\u6846\u4e2d\n            $('#suggestion-list').on('click', 'li', function () {\n                var selectedSuggestion = $(this).text();\n                $('#search-input').val(selectedSuggestion);\n                $('#suggestion-list').empty().hide(); \/\/ \u586b\u5145\u540e\u9690\u85cf\u5efa\u8bae\u5217\u8868\n            });\n        });\n&lt;\/script&gt;\n&lt;\/body&gt;\n&lt;\/html&gt;\n<\/code><\/pre>\n<p>\u722c\u866b\uff1a<\/p>\n<pre class=\"line-numbers\"><code class=\"language-python\">import requests\nfrom bs4 import BeautifulSoup\nimport sqlite3\nimport jieba\nimport threading\nimport time\nimport random\nimport string\nimport re\nfrom datetime import date\nimport base64\nclass Crawler:\n    def get_image_data_uri(self,image_url):\n     # \u53d1\u8d77GET\u8bf7\u6c42\u83b7\u53d6\u56fe\u50cf\u6570\u636e\n     response = requests.get(image_url)\n     image_data = response.content\n\n     # \u5c06\u56fe\u50cf\u6570\u636e\u8f6c\u6362\u4e3abase64\u683c\u5f0f\n     base64_data = base64.b64encode(image_data).decode('utf-8')\n\n     # \u6784\u5efa\u5305\u542bbase64\u56fe\u50cf\u6570\u636e\u7684data URI\n     data_uri = f&quot;data:image\/x-icon;base64,{base64_data}&quot;\n\n     # \u8fd4\u56dedata URI\n     return data_uri\n    def __init__(self, max_depth=3, num_workers=10):\n        self.max_depth = max_depth\n        self.num_workers = num_workers\n        self.conn = sqlite3.connect('data.db', check_same_thread=False)\n        self.lock = threading.Lock()\n        self.url_queue = []\n        self.crawled_urls = set()\n\n        self.create_tables()\n        self.add_urls(['https:\/\/www.hao123.com\/'])\n        self.run()\n\n    def create_tables(self):\n        c = self.conn.cursor()\n        c.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS contents USING fts5 (\n                    title ,\n                    url ,\n                    favicon ,\n                    description ,\n                    keywords ,\n                    date ,\n                    img )''')\n        self.conn.commit()\n\n    def add_urls(self, urls):\n        with self.lock:\n            self.url_queue.extend(urls)\n\n    def crawl_and_save(self, url, depth=0):\n        try:\n            headers = {\n                'User-Agent': 'Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/91.0.4472.124 Safari\/537.36'\n            }\n            if &quot;.ico&quot; not in url and &quot;.jpg&quot; not in url and &quot;.png&quot; not in url and &quot;javascript:;&quot; not in url and &quot;#&quot; not in url and &quot;javascript:void(0)&quot; not in url and &quot;javascript&quot; not in url and url != '':\n             response = requests.get(url, headers=headers, timeout=2.5)\n             response.raise_for_status()\n            else:\n             print(f&quot;\u65e0\u6548\uff1a{url} &quot;)\n             return\n        except (requests.exceptions.RequestException, requests.exceptions.HTTPError) as e:\n            print(f&quot;\u65e0\u6cd5\u83b7\u53d6\u94fe\u63a5 {url}\uff1a{e}&quot;)\n            return\n\n        content_type = response.headers.get('content-type')\n        if not content_type or not content_type.startswith('text\/html'):\n            return\n\n        raw_html = response.content\n        html_text = response.text\n        soup = BeautifulSoup(raw_html, 'html.parser')\n        title_tag = soup.title\n        title=&quot;&quot;\n        if title_tag is None:\n            print(f&quot;\u94fe\u63a5 {url} \u672a\u627e\u5230\u6807\u9898\uff0c\u8df3\u8fc7...&quot;)\n            return\n        if title_tag is not None:\n          title = title_tag.string.strip()\n        if not title:\n            print(f&quot;\u94fe\u63a5 {url} \u6807\u9898\u4e3a\u7a7a\uff0c\u8df3\u8fc7...&quot;)\n            return\n        title2 = &quot; &quot;.join(jieba.cut(title))\n        title2 = &quot;&quot;.join([char for char in title if char not in string.punctuation])  # \u53bb\u6389\u6807\u70b9\u7b26\u53f7\n\n        with self.lock:\n            if url in self.crawled_urls:\n                return\n\n            date_regex = re.compile(r'\\d{4}-\\d{2}-\\d{2}')  # \u5047\u8bbe\u65e5\u671f\u683c\u5f0f\u4e3aYYYY-MM-DD\n            date_match = date_regex.search(html_text)\n            if date_match:\n               shijian = date_match.group()\n            else:\n             # \u4f7f\u7528meta\u6807\u7b7e\u63d0\u53d6\u65e5\u671f\u4fe1\u606f\n               date_tag = soup.select_one('meta[name=&quot;date&quot;], meta[name=&quot;pubdate&quot;]')\n               shijian = date_tag.get('content') if date_tag else None\n\n               # \u5982\u679c\u65e5\u671f\u4e3a\u7a7a\uff0c\u4f7f\u7528\u5f53\u524d\u65e5\u671f\n            if not shijian or shijian.strip() == '':\n               shijian = str(date.today())\n            print(shijian)\n            try:\n                keywords = self.extract_keywords(title2)\n                description, favicon, img_urls = self.extract_page_info(soup)\n                if favicon:\n                 favicon=self.get_image_data_uri(favicon);\n                c = self.conn.cursor()\n                c.execute(\n                    &quot;INSERT INTO contents(title, url, favicon, description, keywords, date, img) VALUES (?, ?, ?, ?, ?, ?, ?)&quot;,\n                    (title, url, favicon, description, &quot;,&quot;.join(keywords), shijian, &quot;\\n&quot;.join(img_urls)))\n                self.conn.commit()\n                self.crawled_urls.add(url)\n                print(f&quot;\u6b63\u5728\u722c\u53d6 '{url}' \u5e76\u4fdd\u5b58\u5230\u6570\u636e\u5e93...&quot;)\n            except sqlite3.IntegrityError:\n                pass\n\n        if depth &lt; self.max_depth:\n            links = soup.find_all('a', href=True)\n            for link in links:\n                next_url = link['href']\n                if not next_url.startswith('http'):\n                    next_url = url + next_url\n                self.add_urls([next_url])  # \u6dfb\u52a0\u65b0\u7684URL\u5230\u961f\u5217\u4e2d\n\n    @staticmethod\n    def extract_keywords(title):\n     words = [word for word in jieba.cut(title) if word not in string.punctuation]  # \u5206\u8bcd\u5e76\u53bb\u6389\u6807\u70b9\u7b26\u53f7\n     keywords = [word for word in words if len(word) &gt; 0]  # \u53bb\u6389\u5355\u4e2a\u5b57\n     keywords = list(set(keywords))  # \u53bb\u91cd\n     keywords.sort(key=words.index)  # \u6309\u5728 title \u4e2d\u51fa\u73b0\u7684\u987a\u5e8f\u6392\u5e8f\n     #keywords = keywords[:10]  # \u53ea\u4fdd\u7559\u524d 10 \u4e2a\u5173\u952e\u8bcd\n     return keywords\n\n    @staticmethod\n    def extract_page_info(soup):\n        description = &quot;&quot;\n        favicon = &quot;&quot;\n        img_urls = []\n\n        meta_description = soup.find('meta', attrs={'name': 'description'})\n        if meta_description and meta_description.has_attr('content'):\n            description = meta_description['content']\n\n        link_favicon = soup.find('link', attrs={'rel': 'icon'})\n        if link_favicon and link_favicon.has_attr('href'):\n            favicon = link_favicon['href']\n\n        img_links = soup.find_all('img')\n        img_urls = [img.get('src') for img in img_links]\n        img_urls = [img for img in img_urls if img is not None]\n\n        return description, favicon, img_urls\n\n    def worker(self):\n        while True:\n            url = None\n            with self.lock:\n                if self.url_queue:\n                    url = self.url_queue.pop(0)\n\n            if url is None:\n                break\n\n            # \u6dfb\u52a0\u968f\u673a\u5ef6\u65f6\n            delay = random.uniform(1, 3)\n            time.sleep(delay)\n\n            self.crawl_and_save(url)\n\n    def run(self):\n        threads = []\n        for _ in range(self.num_workers):\n            t = threading.Thread(target=self.worker)\n            t.start()\n            threads.append(t)\n\n        for t in threads:\n            t.join()\n\n        self.conn.close()\n            #self.run()\n\n\nif __name__ == '__main__':\n    crawler = Crawler(max_depth=5, num_workers=5)\n<\/code><\/pre>\n<p>\u00a0 \u53ef\u80fd\u6709\u4e00\u4e9bbug\uff0c\u63d0\u793a\u8bcd\u529f\u80fd\u5df2\u7ecf\u52a0\u597d\u4e86\u9700\u8981html\u524d\u7aef\u4e2d\u66f4\u6539id\u3002<br \/>\n\u00a0<br \/>\n\u6765\u6e90&#8212;&#8212;python\u5b66\u9738<\/p>\n<!--CusAds0-->\n<div style=\"font-size: 0px; height: 0px; line-height: 0px; margin: 0; padding: 0; clear: both;\"><\/div>","protected":false},"excerpt":{"rendered":"<p>\u00a0 \u00a0\u4eca\u5929\u5b66\u4e60\u8fc7\u7a0b\u4e2d\u5199\u4e86\u4e00\u4e2aflask\u641c\u7d22\u5f15\u64ce\u5305\u62ec\u722c\u866b\u548c\u641c\u7d22\u9875\u9762\u3002\u529f\u80fd\u6709\u5168\u6587\u641c\u7d22\uff0c\u5206\u9875\uff0c\u722c\u866b\u7b49\u7b49\u3002<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"_bbp_topic_count":0,"_bbp_reply_count":0,"_bbp_total_topic_count":0,"_bbp_total_reply_count":0,"_bbp_voice_count":0,"_bbp_anonymous_reply_count":0,"_bbp_topic_count_hidden":0,"_bbp_reply_count_hidden":0,"_bbp_forum_subforum_count":0},"categories":[12],"tags":[],"_links":{"self":[{"href":"https:\/\/zhang.mba\/index.php\/wp-json\/wp\/v2\/posts\/1960"}],"collection":[{"href":"https:\/\/zhang.mba\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/zhang.mba\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/zhang.mba\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/zhang.mba\/index.php\/wp-json\/wp\/v2\/comments?post=1960"}],"version-history":[{"count":0,"href":"https:\/\/zhang.mba\/index.php\/wp-json\/wp\/v2\/posts\/1960\/revisions"}],"wp:attachment":[{"href":"https:\/\/zhang.mba\/index.php\/wp-json\/wp\/v2\/media?parent=1960"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/zhang.mba\/index.php\/wp-json\/wp\/v2\/categories?post=1960"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/zhang.mba\/index.php\/wp-json\/wp\/v2\/tags?post=1960"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}