mirror of
https://github.com/AndyTheFactory/newspaper4k.git
synced 2025-04-04 14:11:16 +08:00
* feat(lang): ⚡ Rework of tokenizer. Additionally implemented new (easier) way of adding languages to the package * feat(lang): added language dependencies as optional * feat(lang): add Bengali, Nepali, Tamil, Georgian, Marathi, Telugu, Latvian, Czech, Slovak, Burmese language support * refactor(lang): moving all language related files in languages folder * refactor(lang): added valid_languages function that returns available languages * refactor(misc): ⚡ removed ParsingCandidate, RawHelper, URLHelper classes. Removed link_hash from article (was never used) * refactor(parse): article.link_hash is no longer available * fix(cli): json output in stdout missing [] * feat(parse): 🔥 article is now pickleable * feat(parse): 🔥 Source object is now pickleable * refactor(parse): ✨ Tidying up the gravity scoring process. No changes in the final score result * refactor(parse): 🚀 compute word statistics for a node taking children nodes into account * fix(parse): ⚡ Bug with auto detecting website language. If no language supplied, the detected language was not used * fix(parse): ⚡ added figure as a tag to be removed before text generation * fix(parse): 🔥 better article paragraph detection * fix(parse): ⚡ get feeds fixed, it was not parsing the main page for possible feeds * fix(misc): ✨ tidying up some code in urls.py * feat(misc): better typing support and type hinting Author: Tom Parker-Shemilt <palfrey@***.net> * feat(misc): Simplify favicon return Author: Tom Parker-Shemilt <palfrey@***.net> * feat(misc): Basic mypy support Author: Tom Parker-Shemilt <palfrey@***.net> * feat(doc): 📝 adding evaluation results * feat(doc): 🚀 Documentation Update. Added Examples, documented new features * refactor(core): Minimum Python now 3.8; Also test 3.10/11/12 Author: Tom Parker-Shemilt <palfrey@***.net> * refactor(core): run gh actions on PRs. Author: Tom Parker-Shemilt <palfrey@***.net> * refactor(core): Set SETUPTOOLS_USE_DISTUTILS. setuptools as per numpy recommendations. 
Upgrade numpy and pandas for >= 3.9. Author: Tom Parker-Shemilt <palfrey@***.net> * refactor(core): Upgrade regex, virtualenv to avoid breaking pre-commit, distutils for everyone. Author: Tom Parker-Shemilt <palfrey@***.net> * feat(sources): ✨ new option when building sources. You can limit the article parsing to the source home page only. Other categories or feeds are then ignored * feat(misc): 📈 added cloudscraper as optional dependency. If installed, it will use cloudscraper as a layer over requests. Cloudscraper tries to bypass Cloudflare protection * feat(lang): ✨ New integration of Google news using GNews module. You can now use GoogleNewsSource to search and parse news based on keywords, topic, location or website * fix(parse): ⚡ Better title parsing. Added language specific regex for article titles * feat(parse): ✨ added filter that limits the source.build to a specific category. use source.build(url,only_in_path=True) to scrape only stories that are in the starting url path * fix(parse): 🔥 better binary content detection * fix(lang): ⚡ better is_highlink_density for non-latin languages * feat(lang): 📝 added stopwords for af, br, ca, eo, eu, ga, gl, gu, ha, hy, ku, ms, so, st, tl, ur, yo, zu from https://github.com/stopwords-iso * refactor(parse): 💥 deprecated text_cleaned, clean_doc. Removed clean_top_node, article.clean_top_node is removed. Failures if it was accessed * feat(lang): 🚀 added support for another 13 languages * fix(misc): 🎨 mypy stubs for gnews and cloudscraper + small typing fixes * fix(parse): 🐛 better feed discovery in Source objects * fix(parse): 🐛 fixed an issue with non latin high density detection * docs(doc): 🔥 Added typing and docstrings to most of the code * fix(types): 🎨 added stubs for gnews * fix(misc): 🚑 python-setup github action version bump Co-authored-by: Tom Parker-Shemilt <palfrey@tevp.net>
67 lines
1.6 KiB
Python
Executable File
67 lines
1.6 KiB
Python
Executable File
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Packaging script for the ``newspaper4k`` distribution.

Reads the runtime dependencies from ``requirements.txt`` and the long
description from ``README.rst`` (both located next to this file) and passes
them to ``setup()``. Invoking the script with ``publish`` as the last
command-line argument builds an sdist and uploads it instead.

Andrei Paraschiv-- https://github.com/AndyTheFactory
"""

import codecs
import os
import subprocess
import sys

try:
    from setuptools import setup
except ImportError:
    # NOTE(review): several keywords passed to setup() below
    # (install_requires, python_requires, include_package_data, zip_safe) are
    # setuptools features, and distutils is no longer shipped with recent
    # Python versions - confirm whether this fallback is still wanted.
    from distutils.core import setup

# Resolve data files relative to this script rather than the current working
# directory, so the script also works when it is launched from elsewhere.
HERE = os.path.dirname(os.path.abspath(__file__))

packages = [
    "newspaper",
]

if sys.argv[-1] == "publish":
    # Run the upload with the interpreter that is executing this script and
    # without going through a shell; propagate the child's exit status so a
    # failed upload is not reported as success.
    # NOTE(review): recent setuptools releases no longer provide the
    # ``upload`` command (twine is the documented replacement) - confirm
    # before relying on this shortcut.
    completed = subprocess.run(
        [sys.executable, "setup.py", "sdist", "upload", "-r", "pypi"],
        cwd=HERE,
        check=False,
    )
    sys.exit(completed.returncode)

if sys.version_info < (3, 8):
    sys.exit("Sorry, Python < 3.8 is not supported")

with open(os.path.join(HERE, "requirements.txt"), encoding="utf-8") as f:
    # Skip blank lines and full-line comments so that only real requirement
    # specifiers end up in install_requires.
    required_packages = [
        line.strip()
        for line in f
        if line.strip() and not line.lstrip().startswith("#")
    ]

with codecs.open(os.path.join(HERE, "README.rst"), "r", "utf-8") as f:
    readme = f.read()

setup(
    name="newspaper4k",
    version="0.9.0",
    description="Simplified python article discovery & extraction.",
    long_description=readme,
    author="Andrei Paraschiv",
    author_email="andrei@thephpfactory.com",
    url="https://github.com/AndyTheFactory/newspaper4k",
    packages=packages,
    python_requires=">=3.8",
    include_package_data=True,
    install_requires=required_packages,
    license="MIT",
    zip_safe=False,
    classifiers=[
        "Development Status :: 4 - Beta",
        "Programming Language :: Python :: 3",
        "Natural Language :: English",
        "Topic :: Text Processing",
        "Topic :: Text Processing :: Markup :: HTML",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Information Technology",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
    ],
)