From 8239fdb8f3d121cd0f312d808dbb08097b30dd09 Mon Sep 17 00:00:00 2001
From: Yuvi63771 <114073886+Yuvi9587@users.noreply.github.com>
Date: Wed, 8 Oct 2025 17:02:46 +0530
Subject: [PATCH] Commit

---
 LinkMaker/hentai2read.py               |    0
 assets/Ko-fi.png                       |  Bin 0 -> 3019 bytes
 assets/buymeacoffee.png                |  Bin 0 -> 3276 bytes
 assets/patreon.png                     |  Bin 0 -> 978 bytes
 src/config/constants.py                |    9 +-
 src/core/Hentai2read_client.py         |  245 ++-
 src/core/allcomic_client.py            |  116 ++
 src/core/api_client.py                 |    6 +-
 src/core/booru_client.py               |  375 ++++
 src/core/bunkr_client.py               |   28 +-
 src/core/fap_nation_client.py          |  125 ++
 src/core/mangadex_client.py            |  189 ++
 src/core/pixeldrain_client.py          |   93 +
 src/core/simpcity_client.py            |  100 +
 src/core/toonily_client.py             |   73 +
 src/core/workers.py                    | 1089 ++++++-----
 src/i18n/translator.py                 |    2 +-
 src/services/drive_downloader.py       |  737 ++++++--
 src/services/multipart_downloader.py   |    2 +-
 src/services/updater.py                |    7 +-
 src/ui/dialogs/CookieHelpDialog.py     |    3 -
 src/ui/dialogs/CustomFilenameDialog.py |   89 +
 src/ui/dialogs/ErrorFilesDialog.py     |  190 +-
 src/ui/dialogs/ExportLinksDialog.py    |  226 +++
 src/ui/dialogs/FutureSettingsDialog.py |   53 +-
 src/ui/dialogs/HelpGuideDialog.py      |    2 +-
 src/ui/dialogs/SinglePDF.py            |    8 +-
 src/ui/dialogs/SupportDialog.py        |  276 +--
 src/ui/dialogs/TourDialog.py           |  177 +-
 src/ui/main_window.py                  | 2364 +++++++++++++++++++-----
 src/utils/command.py                   |   49 +
 src/utils/file_utils.py                |    4 +-
 src/utils/network_utils.py             |   42 +-
 src/utils/resolution.py                |  152 +-
 src/utils/text_utils.py                |   17 +-
 yt-dlp.exe                             |  Bin 0 -> 18331067 bytes
 36 files changed, 5380 insertions(+), 1468 deletions(-)
 create mode 100644 LinkMaker/hentai2read.py
 create mode 100644 assets/Ko-fi.png
 create mode 100644 assets/buymeacoffee.png
 create mode 100644 assets/patreon.png
 create mode 100644 src/core/allcomic_client.py
 create mode 100644 src/core/booru_client.py
 create mode 100644 src/core/fap_nation_client.py
 create mode 100644 src/core/mangadex_client.py
 create mode 100644 src/core/pixeldrain_client.py
 create mode 100644 src/core/simpcity_client.py
 create mode 100644 src/core/toonily_client.py
 create mode 100644 src/ui/dialogs/CustomFilenameDialog.py
 create mode 100644 src/ui/dialogs/ExportLinksDialog.py
 create mode 100644 src/utils/command.py
 create mode 100644 yt-dlp.exe

diff --git a/LinkMaker/hentai2read.py b/LinkMaker/hentai2read.py
new file mode 100644
index 0000000..e69de29
diff --git a/assets/Ko-fi.png b/assets/Ko-fi.png
new file mode 100644
index 0000000000000000000000000000000000000000..d2172ada56f1850a8cb6da15b5ab8a2386787630
GIT binary patch
literal 3019
[base85-encoded PNG data omitted]

literal 0
HcmV?d00001

diff --git a/assets/buymeacoffee.png b/assets/buymeacoffee.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce9988bf36d2a23f2d9a0624f13fc9fbe3160028
GIT binary patch
literal 3276
[base85-encoded PNG data omitted]

literal 0
HcmV?d00001

diff --git a/assets/patreon.png b/assets/patreon.png
new file mode 100644
index 0000000000000000000000000000000000000000..2418803ae2d38da368ccad7ba921a5aeb8ebe174
GIT binary patch
literal 978
[base85-encoded PNG data omitted]

literal 0
HcmV?d00001

diff --git a/src/config/constants.py b/src/config/constants.py
index 1d49752..30aebf7 100644
--- a/src/config/constants.py
+++ b/src/config/constants.py
@@ -47,6 +47,8 @@ MAX_PARTS_FOR_MULTIPART_DOWNLOAD = 15
 # --- UI and Settings Keys (for QSettings) ---
 TOUR_SHOWN_KEY = "neverShowTourAgainV19"
 MANGA_FILENAME_STYLE_KEY = "mangaFilenameStyleV1"
+MANGA_CUSTOM_FORMAT_KEY = "mangaCustomFormatV1"
+MANGA_CUSTOM_DATE_FORMAT_KEY = "mangaCustomDateFormatV1"
 SKIP_WORDS_SCOPE_KEY = "skipWordsScopeV1"
 ALLOW_MULTIPART_DOWNLOAD_KEY = "allowMultipartDownloadV1"
 USE_COOKIE_KEY = "useCookieV1"
@@ -59,6 +61,8 @@ DOWNLOAD_LOCATION_KEY = "downloadLocationV1"
 RESOLUTION_KEY = "window_resolution"
 UI_SCALE_KEY = "ui_scale_factor"
 SAVE_CREATOR_JSON_KEY = "saveCreatorJsonProfile"
+DATE_PREFIX_FORMAT_KEY = "datePrefixFormatV1"
+AUTO_RETRY_ON_FINISH_KEY = "auto_retry_on_finish"
 FETCH_FIRST_KEY = "fetchAllPostsFirst"

 DISCORD_TOKEN_KEY = "discord/token"
@@ -84,7 +88,7 @@ VIDEO_EXTENSIONS = {
     '.mpg', '.m4v', '.3gp', '.ogv', '.ts', '.vob'
 }
 ARCHIVE_EXTENSIONS = {
-    '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2'
+    '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2', '.bin'
 }
 AUDIO_EXTENSIONS = {
     '.mp3', '.wav', '.aac',
'.flac', '.ogg', '.wma', '.m4a', '.opus', @@ -122,4 +126,5 @@ CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = { # --- Duplicate Handling Modes --- DUPLICATE_HANDLING_HASH = "hash" -DUPLICATE_HANDLING_KEEP_ALL = "keep_all" \ No newline at end of file +DUPLICATE_HANDLING_KEEP_ALL = "keep_all" +STYLE_CUSTOM = "custom" \ No newline at end of file diff --git a/src/core/Hentai2read_client.py b/src/core/Hentai2read_client.py index 38bbac5..a144276 100644 --- a/src/core/Hentai2read_client.py +++ b/src/core/Hentai2read_client.py @@ -2,71 +2,206 @@ import re import os -import json -import requests +import time import cloudscraper from bs4 import BeautifulSoup +from urllib.parse import urljoin +from concurrent.futures import ThreadPoolExecutor +import queue -def fetch_hentai2read_data(url, logger, session): +def run_hentai2read_download(start_url, output_dir, progress_callback, overall_progress_callback, check_pause_func): """ - Scrapes a SINGLE Hentai2Read chapter page using a provided session. + Orchestrates the download process using a producer-consumer model. + The main thread scrapes image URLs and puts them in a queue. + A pool of worker threads consumes from the queue to download images concurrently. """ - logger(f"Attempting to fetch chapter data from: {url}") + scraper = cloudscraper.create_scraper() try: - response = session.get(url, timeout=30) - response.raise_for_status() + progress_callback(" [Hentai2Read] Scraping series page for all metadata...") + top_level_folder_name, chapters_to_process = _get_series_metadata(start_url, progress_callback, scraper) + + if not chapters_to_process: + progress_callback("❌ No chapters found to download. Aborting.") + return 0, 0 - page_content_text = response.text - soup = BeautifulSoup(page_content_text, 'html.parser') - - album_title = "" - title_tags = soup.select('span[itemprop="name"]') - if title_tags: - album_title = title_tags[-1].text.strip() + total_chapters = len(chapters_to_process) + overall_progress_callback(total_chapters, 0) - if not album_title: - title_tag = soup.select_one('h1.title') - if title_tag: - album_title = title_tag.text.strip() + total_downloaded_count = 0 + total_skipped_count = 0 - if not album_title: - logger("❌ Could not find album title on page.") - return None, None + for idx, chapter in enumerate(chapters_to_process): + if check_pause_func(): break + + progress_callback(f"\n-- Processing and Downloading Chapter {idx + 1}/{total_chapters}: '{chapter['title']}' --") + + series_folder = re.sub(r'[\\/*?:"<>|]', "", top_level_folder_name).strip() + chapter_folder = re.sub(r'[\\/*?:"<>|]', "", chapter['title']).strip() + final_save_path = os.path.join(output_dir, series_folder, chapter_folder) + os.makedirs(final_save_path, exist_ok=True) + + # This function now scrapes and downloads simultaneously + dl_count, skip_count = _process_and_download_chapter( + chapter_url=chapter['url'], + save_path=final_save_path, + scraper=scraper, + progress_callback=progress_callback, + check_pause_func=check_pause_func + ) + + total_downloaded_count += dl_count + total_skipped_count += skip_count + + overall_progress_callback(total_chapters, idx + 1) + if check_pause_func(): break - image_urls = [] - try: - start_index = page_content_text.index("'images' : ") + len("'images' : ") - end_index = page_content_text.index(",\n", start_index) - images_json_str = page_content_text[start_index:end_index] - image_paths = json.loads(images_json_str) - image_urls = ["https://hentaicdn.com/hentai" + part for part in image_paths] - except (ValueError, 
json.JSONDecodeError): - logger("❌ Could not find or parse image JSON data for this chapter.") - return None, None + return total_downloaded_count, total_skipped_count - if not image_urls: - logger("❌ No image URLs found for this chapter.") - return None, None - - logger(f" Found {len(image_urls)} images for album '{album_title}'.") - - files_to_download = [] - for i, img_url in enumerate(image_urls): - page_num = i + 1 - extension = os.path.splitext(img_url)[1].split('?')[0] - if not extension: extension = ".jpg" - filename = f"{page_num:03d}{extension}" - files_to_download.append({'url': img_url, 'filename': filename}) - - return album_title, files_to_download - - except requests.exceptions.HTTPError as e: - if e.response.status_code == 404: - logger(f" Chapter not found (404 Error). This likely marks the end of the series.") - else: - logger(f"❌ An HTTP error occurred: {e}") - return None, None except Exception as e: - logger(f"❌ An unexpected error occurred while fetching data: {e}") - return None, None + progress_callback(f"❌ A critical error occurred in the Hentai2Read client: {e}") + return 0, 0 + +def _get_series_metadata(start_url, progress_callback, scraper): + """ + Scrapes the main series page to get the Artist Name, Series Title, and chapter list. + """ + try: + response = scraper.get(start_url, timeout=30) + response.raise_for_status() + soup = BeautifulSoup(response.text, 'html.parser') + + series_title = "Unknown Series" + artist_name = None + metadata_list = soup.select_one("ul.list.list-simple-mini") + + if metadata_list: + first_li = metadata_list.find('li', recursive=False) + if first_li and not first_li.find('a'): + series_title = first_li.get_text(strip=True) + + for b_tag in metadata_list.find_all('b'): + label = b_tag.get_text(strip=True) + if label in ("Artist", "Author"): + a_tag = b_tag.find_next_sibling('a') + if a_tag: + artist_name = a_tag.get_text(strip=True) + if label == "Artist": + break + + top_level_folder_name = artist_name if artist_name else series_title + + chapter_links = soup.select("div.media a.pull-left.font-w600") + if not chapter_links: + chapters_to_process = [{'url': start_url, 'title': series_title}] + else: + chapters_to_process = [ + {'url': urljoin(start_url, link['href']), 'title': " ".join(link.stripped_strings)} + for link in chapter_links + ] + chapters_to_process.reverse() + + progress_callback(f" [Hentai2Read] ✅ Found Artist/Series: '{top_level_folder_name}'") + progress_callback(f" [Hentai2Read] ✅ Found {len(chapters_to_process)} chapters to process.") + + return top_level_folder_name, chapters_to_process + + except Exception as e: + progress_callback(f" [Hentai2Read] ❌ Error getting series metadata: {e}") + return "Unknown Series", [] + +### NEW: This function contains the pipeline logic ### +def _process_and_download_chapter(chapter_url, save_path, scraper, progress_callback, check_pause_func): + """ + Uses a producer-consumer pattern to download a chapter. + The main thread (producer) scrapes URLs one by one. + Worker threads (consumers) download the URLs as they are found. 
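+    Returns a (downloaded, skipped) tuple once the scrape loop has finished
+    and the download queue has fully drained.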
+ """ + task_queue = queue.Queue() + num_download_threads = 8 + + # These will be updated by the worker threads + download_stats = {'downloaded': 0, 'skipped': 0} + + def downloader_worker(): + """The function that each download thread will run.""" + # Create a unique session for each thread to avoid conflicts + worker_scraper = cloudscraper.create_scraper() + while True: + try: + # Get a task from the queue + task = task_queue.get() + # The sentinel value to signal the end + if task is None: + break + + filepath, img_url = task + if os.path.exists(filepath): + progress_callback(f" -> Skip: '{os.path.basename(filepath)}'") + download_stats['skipped'] += 1 + else: + progress_callback(f" Downloading: '{os.path.basename(filepath)}'...") + response = worker_scraper.get(img_url, stream=True, timeout=60, headers={'Referer': chapter_url}) + response.raise_for_status() + with open(filepath, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + download_stats['downloaded'] += 1 + except Exception as e: + progress_callback(f" ❌ Download failed for task. Error: {e}") + download_stats['skipped'] += 1 + finally: + task_queue.task_done() + + # --- Start the downloader threads --- + executor = ThreadPoolExecutor(max_workers=num_download_threads, thread_name_prefix='H2R_Downloader') + for _ in range(num_download_threads): + executor.submit(downloader_worker) + + # --- Main thread acts as the scraper (producer) --- + page_number = 1 + while True: + if check_pause_func(): break + if page_number > 300: # Safety break + progress_callback(" [Hentai2Read] ⚠️ Safety break: Reached 300 pages.") + break + + page_url_to_check = f"{chapter_url}{page_number}/" + try: + response = scraper.get(page_url_to_check, timeout=30) + if response.history or response.status_code != 200: + progress_callback(f" [Hentai2Read] End of chapter detected on page {page_number}.") + break + + soup = BeautifulSoup(response.text, 'html.parser') + img_tag = soup.select_one("img#arf-reader") + img_src = img_tag.get("src") if img_tag else None + + if not img_tag or img_src == "https://static.hentai.direct/hentai": + progress_callback(f" [Hentai2Read] End of chapter detected (Placeholder image on page {page_number}).") + break + + normalized_img_src = urljoin(response.url, img_src) + ext = os.path.splitext(normalized_img_src.split('/')[-1])[-1] or ".jpg" + filename = f"{page_number:03d}{ext}" + filepath = os.path.join(save_path, filename) + + # Put the download task into the queue for a worker to pick up + task_queue.put((filepath, normalized_img_src)) + + page_number += 1 + time.sleep(0.1) # Small delay between scraping pages + except Exception as e: + progress_callback(f" [Hentai2Read] ❌ Error while scraping page {page_number}: {e}") + break + + # --- Shutdown sequence --- + # Tell all worker threads to exit by sending the sentinel value + for _ in range(num_download_threads): + task_queue.put(None) + + # Wait for all download tasks to be completed + executor.shutdown(wait=True) + + progress_callback(f" Found and processed {page_number - 1} images for this chapter.") + return download_stats['downloaded'], download_stats['skipped'] \ No newline at end of file diff --git a/src/core/allcomic_client.py b/src/core/allcomic_client.py new file mode 100644 index 0000000..91ba1c4 --- /dev/null +++ b/src/core/allcomic_client.py @@ -0,0 +1,116 @@ +import requests +import re +from bs4 import BeautifulSoup +import cloudscraper +import time +from urllib.parse import urlparse + +def get_chapter_list(series_url, logger_func): 
+ """ + Checks if a URL is a series page and returns a list of all chapter URLs if it is. + Includes a retry mechanism for robust connection. + """ + logger_func(f" [AllComic] Checking for chapter list at: {series_url}") + + scraper = cloudscraper.create_scraper() + response = None + max_retries = 8 + + for attempt in range(max_retries): + try: + response = scraper.get(series_url, timeout=30) + response.raise_for_status() + logger_func(f" [AllComic] Successfully connected to series page on attempt {attempt + 1}.") + break # Success, exit the loop + except requests.RequestException as e: + logger_func(f" [AllComic] ⚠️ Series page check attempt {attempt + 1}/{max_retries} failed: {e}") + if attempt < max_retries - 1: + wait_time = 2 * (attempt + 1) + logger_func(f" Retrying in {wait_time} seconds...") + time.sleep(wait_time) + else: + logger_func(f" [AllComic] ❌ All attempts to check series page failed.") + return [] # Return empty on final failure + + if not response: + return [] + + try: + soup = BeautifulSoup(response.text, 'html.parser') + chapter_links = soup.select('li.wp-manga-chapter a') + + if not chapter_links: + logger_func(" [AllComic] ℹ️ No chapter list found. Assuming this is a single chapter page.") + return [] + + chapter_urls = [link['href'] for link in chapter_links] + chapter_urls.reverse() # Reverse for oldest-to-newest reading order + + logger_func(f" [AllComic] ✅ Found {len(chapter_urls)} chapters.") + return chapter_urls + + except Exception as e: + logger_func(f" [AllComic] ❌ Error parsing chapters after successful connection: {e}") + return [] + +def fetch_chapter_data(chapter_url, logger_func): + """ + Fetches the comic title, chapter title, and image URLs for a single chapter page. + """ + logger_func(f" [AllComic] Fetching page: {chapter_url}") + + scraper = cloudscraper.create_scraper( + browser={'browser': 'firefox', 'platform': 'windows', 'desktop': True} + ) + headers = {'Referer': 'https://allporncomic.com/'} + + response = None + max_retries = 8 + for attempt in range(max_retries): + try: + response = scraper.get(chapter_url, headers=headers, timeout=30) + response.raise_for_status() + break + except requests.RequestException as e: + if attempt < max_retries - 1: + time.sleep(2 * (attempt + 1)) + else: + logger_func(f" [AllComic] ❌ All connection attempts failed for chapter: {chapter_url}") + return None, None, None + + try: + soup = BeautifulSoup(response.text, 'html.parser') + title_element = soup.find('h1', class_='post-title') + comic_title = None + if title_element: + comic_title = title_element.text.strip() + else: + try: + path_parts = urlparse(chapter_url).path.strip('/').split('/') + if len(path_parts) >= 3 and path_parts[-3] == 'porncomic': + comic_slug = path_parts[-2] + comic_title = comic_slug.replace('-', ' ').title() + except Exception: + comic_title = "Unknown Comic" + + chapter_slug = chapter_url.strip('/').split('/')[-1] + chapter_title = chapter_slug.replace('-', ' ').title() + + reading_container = soup.find('div', class_='reading-content') + list_of_image_urls = [] + if reading_container: + image_elements = reading_container.find_all('img', class_='wp-manga-chapter-img') + for img in image_elements: + img_url = (img.get('data-src') or img.get('src', '')).strip() + if img_url: + list_of_image_urls.append(img_url) + + if not comic_title or comic_title == "Unknown Comic" or not list_of_image_urls: + logger_func(f" [AllComic] ❌ Could not find a valid title or images on the page. 
Title found: '{comic_title}'") + return None, None, None + + return comic_title, chapter_title, list_of_image_urls + + except Exception as e: + logger_func(f" [AllComic] ❌ An unexpected error occurred while parsing the page: {e}") + return None, None, None \ No newline at end of file diff --git a/src/core/api_client.py b/src/core/api_client.py index 138ad9d..0493d1f 100644 --- a/src/core/api_client.py +++ b/src/core/api_client.py @@ -33,7 +33,7 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev if cancellation_event and cancellation_event.is_set(): raise RuntimeError("Fetch operation cancelled by user during retry loop.") - log_message = f" Fetching post list: {paginated_url} (Page approx. {offset // 50 + 1})" + log_message = f" Fetching post list: {api_url_base} (Page approx. {offset // 50 + 1})" if attempt > 0: log_message += f" (Attempt {attempt + 1}/{max_retries})" logger(log_message) @@ -247,7 +247,7 @@ def download_from_api( break all_posts_for_manga_mode.extend(posts_batch_manga) - logger(f"MANGA_FETCH_PROGRESS:{len(all_posts_for_manga_mode)}:{current_page_num_manga}") + logger(f"RENAMING_MODE_FETCH_PROGRESS:{len(all_posts_for_manga_mode)}:{current_page_num_manga}") current_offset_manga += page_size time.sleep(0.6) @@ -265,7 +265,7 @@ def download_from_api( if cancellation_event and cancellation_event.is_set(): return if all_posts_for_manga_mode: - logger(f"MANGA_FETCH_COMPLETE:{len(all_posts_for_manga_mode)}") + logger(f"RENAMING_MODE_FETCH_COMPLETE:{len(all_posts_for_manga_mode)}") if all_posts_for_manga_mode: if processed_post_ids: diff --git a/src/core/booru_client.py b/src/core/booru_client.py new file mode 100644 index 0000000..6ddcf13 --- /dev/null +++ b/src/core/booru_client.py @@ -0,0 +1,375 @@ +# src/core/booru_client.py + +import os +import re +import time +import datetime +import urllib.parse +import requests +import logging +import cloudscraper +# --- Start of Combined Code from 1.py --- + +# Part 1: Essential Utilities & Exceptions + +class BooruClientException(Exception): + """Base class for exceptions in this client.""" + pass + +class HttpError(BooruClientException): + """HTTP request during data extraction failed.""" + def __init__(self, message="", response=None): + self.response = response + self.status = response.status_code if response else 0 + if response and not message: + message = f"'{response.status_code} {response.reason}' for '{response.url}'" + super().__init__(message) + +class NotFoundError(BooruClientException): + pass + +def unquote(s): + return urllib.parse.unquote(s) + +def parse_datetime(date_string, fmt): + try: + # Assumes date_string is in a format that strptime can handle with timezone + return datetime.datetime.strptime(date_string, fmt) + except (ValueError, TypeError): + return None + +def nameext_from_url(url, data=None): + if data is None: data = {} + try: + path = urllib.parse.urlparse(url).path + filename = unquote(os.path.basename(path)) + if '.' 
in filename: + name, ext = filename.rsplit('.', 1) + data["filename"], data["extension"] = name, ext.lower() + else: + data["filename"], data["extension"] = filename, "" + except Exception: + data["filename"], data["extension"] = "", "" + return data + +USERAGENT_FIREFOX = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0" + +# Part 2: Core Extractor Logic + +class Extractor: + category = "" + subcategory = "" + directory_fmt = ("{category}", "{id}") + filename_fmt = "{filename}.{extension}" + _retries = 3 + _timeout = 30 + + def __init__(self, match, logger_func=print): + self.url = match.string + self.match = match + self.groups = match.groups() + self.session = cloudscraper.create_scraper() + self.session.headers["User-Agent"] = USERAGENT_FIREFOX + self.log = logger_func + self.api_key = None + self.user_id = None + + def set_auth(self, api_key, user_id): + self.api_key = api_key + self.user_id = user_id + self._init_auth() + + def _init_auth(self): + """Placeholder for extractor-specific auth setup.""" + pass + + def request(self, url, method="GET", fatal=True, **kwargs): + for attempt in range(self._retries + 1): + try: + response = self.session.request(method, url, timeout=self._timeout, **kwargs) + if response.status_code < 400: + return response + if response.status_code == 404 and fatal: + raise NotFoundError(f"Resource not found at {url}") + self.log(f"Request for {url} failed with status {response.status_code}. Retrying...") + except requests.exceptions.RequestException as e: + self.log(f"Request for {url} failed: {e}. Retrying...") + if attempt < self._retries: + time.sleep(2 ** attempt) + if fatal: + raise HttpError(f"Failed to retrieve {url} after {self._retries} retries.") + return None + + def request_json(self, url, **kwargs): + response = self.request(url, **kwargs) + try: + return response.json() + except (ValueError, TypeError) as exc: + self.log(f"Failed to decode JSON from {url}: {exc}") + raise BooruClientException("Invalid JSON response") + + def items(self): + data = self.metadata() + for item in self.posts(): + # Check for our special page update message + if isinstance(item, tuple) and item[0] == 'PAGE_UPDATE': + yield item + continue + + # Otherwise, process it as a post + post = item + url = post.get("file_url") + if not url: continue + + nameext_from_url(url, post) + post["date"] = parse_datetime(post.get("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z") + + if url.startswith("/"): + url = self.root + url + post['file_url'] = url # Ensure full URL + + post.update(data) + yield post + +class BaseExtractor(Extractor): + instances = () + + def __init__(self, match, logger_func=print): + super().__init__(match, logger_func) + self._init_category() + + def _init_category(self): + parsed_url = urllib.parse.urlparse(self.url) + self.root = f"{parsed_url.scheme}://{parsed_url.netloc}" + for i, group in enumerate(self.groups): + if group is not None: + try: + self.category = self.instances[i][0] + return + except IndexError: + continue + + @classmethod + def update(cls, instances): + pattern_list = [] + instance_list = cls.instances = [] + for category, info in instances.items(): + root = info["root"].rstrip("/") if info["root"] else "" + instance_list.append((category, root, info)) + pattern = info.get("pattern", re.escape(root.partition("://")[2])) + pattern_list.append(f"({pattern})") + return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")" + +# Part 3: Danbooru Extractor + +class DanbooruExtractor(BaseExtractor): + filename_fmt = 
"{category}_{id}_{filename}.{extension}" + per_page = 200 + + def __init__(self, match, logger_func=print): + super().__init__(match, logger_func) + self._auth_logged = False + + def _init_auth(self): + if self.user_id and self.api_key: + if not self._auth_logged: + self.log("Danbooru auth set.") + self._auth_logged = True + self.session.auth = (self.user_id, self.api_key) + + + def items(self): + data = self.metadata() + for item in self.posts(): + # Check for our special page update message + if isinstance(item, tuple) and item[0] == 'PAGE_UPDATE': + yield item + continue + + # Otherwise, process it as a post + post = item + url = post.get("file_url") + if not url: continue + + nameext_from_url(url, post) + post["date"] = parse_datetime(post.get("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z") + + if url.startswith("/"): + url = self.root + url + post['file_url'] = url # Ensure full URL + + post.update(data) + yield post + + def metadata(self): + return {} + + def posts(self): + return [] + + def _pagination(self, endpoint, params, prefix="b"): + url = self.root + endpoint + params["limit"] = self.per_page + params["page"] = 1 + threshold = self.per_page - 20 + + while True: + posts = self.request_json(url, params=params) + if not posts: break + yield ('PAGE_UPDATE', len(posts)) + yield from posts + if len(posts) < threshold: return + if prefix: + params["page"] = f"{prefix}{posts[-1]['id']}" + else: + params["page"] += 1 + +BASE_PATTERN = DanbooruExtractor.update({ + "danbooru": {"root": None, "pattern": r"(?:danbooru|safebooru)\.donmai\.us"}, +}) + +class DanbooruTagExtractor(DanbooruExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + pattern = BASE_PATTERN + r"(/posts\?(?:[^&#]*&)*tags=([^&#]*))" + + def metadata(self): + self.tags = unquote(self.groups[-1].replace("+", " ")).strip() + sanitized_tags = re.sub(r'[\\/*?:"<>|]', "_", self.tags) + return {"search_tags": sanitized_tags} + + def posts(self): + return self._pagination("/posts.json", {"tags": self.tags}) + +class DanbooruPostExtractor(DanbooruExtractor): + subcategory = "post" + pattern = BASE_PATTERN + r"(/post(?:s|/show)/(\d+))" + + def posts(self): + post_id = self.groups[-1] + url = f"{self.root}/posts/{post_id}.json" + post = self.request_json(url) + return (post,) if post else () + +class GelbooruBase(Extractor): + category = "gelbooru" + root = "https://gelbooru.com" + + def __init__(self, match, logger_func=print): + super().__init__(match, logger_func) + self._auth_logged = False + + def _api_request(self, params, key="post"): + # Auth is now added dynamically + if self.api_key and self.user_id: + if not self._auth_logged: + self.log("Gelbooru auth set.") + self._auth_logged = True + params.update({"api_key": self.api_key, "user_id": self.user_id}) + + url = self.root + "/index.php?page=dapi&q=index&json=1" + data = self.request_json(url, params=params) + + if not key: return data + posts = data.get(key, []) + return posts if isinstance(posts, list) else [posts] if posts else [] + + def items(self): + base_data = self.metadata() + base_data['category'] = self.category + + for item in self.posts(): + # Check for our special page update message + if isinstance(item, tuple) and item[0] == 'PAGE_UPDATE': + yield item + continue + + # Otherwise, process it as a post + post = item + url = post.get("file_url") + if not url: continue + + data = base_data.copy() + data.update(post) + nameext_from_url(url, data) + yield data + + def metadata(self): return {} + def posts(self): return [] + 
+GELBOORU_PATTERN = r"(?:https?://)?(?:www\.)?gelbooru\.com" + +class GelbooruTagExtractor(GelbooruBase): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + filename_fmt = "{category}_{id}_{md5}.{extension}" + pattern = GELBOORU_PATTERN + r"(/index\.php\?page=post&s=list&tags=([^&#]*))" + + def metadata(self): + self.tags = unquote(self.groups[-1].replace("+", " ")).strip() + sanitized_tags = re.sub(r'[\\/*?:"<>|]', "_", self.tags) + return {"search_tags": sanitized_tags} + + def posts(self): + """Scrapes HTML search pages as API can be restrictive for tags.""" + pid = 0 + posts_per_page = 42 + search_url = self.root + "/index.php" + params = {"page": "post", "s": "list", "tags": self.tags} + + while True: + params['pid'] = pid + self.log(f"Scraping search results page (offset: {pid})...") + response = self.request(search_url, params=params) + html_content = response.text + post_ids = re.findall(r'id="p(\d+)"', html_content) + + if not post_ids: + self.log("No more posts found on page. Ending scrape.") + break + yield ('PAGE_UPDATE', len(post_ids)) + for post_id in post_ids: + post_data = self._api_request({"s": "post", "id": post_id}) + yield from post_data + + pid += posts_per_page + +class GelbooruPostExtractor(GelbooruBase): + subcategory = "post" + filename_fmt = "{category}_{id}_{md5}.{extension}" + pattern = GELBOORU_PATTERN + r"(/index\.php\?page=post&s=view&id=(\d+))" + + def posts(self): + post_id = self.groups[-1] + return self._api_request({"s": "post", "id": post_id}) + +# --- Main Entry Point --- + +EXTRACTORS = [ + DanbooruTagExtractor, + DanbooruPostExtractor, + GelbooruTagExtractor, + GelbooruPostExtractor, +] + +def find_extractor(url, logger_func): + for extractor_cls in EXTRACTORS: + match = re.search(extractor_cls.pattern, url) + if match: + return extractor_cls(match, logger_func) + return None + +def fetch_booru_data(url, api_key, user_id, logger_func): + """ + Main function to find an extractor and yield image data. + """ + extractor = find_extractor(url, logger_func) + if not extractor: + logger_func(f"No suitable Booru extractor found for URL: {url}") + return + + logger_func(f"Using extractor: {extractor.__class__.__name__}") + extractor.set_auth(api_key, user_id) + + # The 'items' method will now yield the data dictionaries directly + yield from extractor.items() \ No newline at end of file diff --git a/src/core/bunkr_client.py b/src/core/bunkr_client.py index 7821419..9480891 100644 --- a/src/core/bunkr_client.py +++ b/src/core/bunkr_client.py @@ -207,7 +207,7 @@ def get_bunkr_extractor(url, logger): def fetch_bunkr_data(url, logger): """ Main function to be called from the GUI. - It extracts all file information from a Bunkr URL. + It extracts all file information from a Bunkr URL, now handling both albums and direct file links. Returns: A tuple of (album_name, list_of_files) @@ -215,6 +215,30 @@ def fetch_bunkr_data(url, logger): - list_of_files (list): A list of dicts, each containing 'url', 'name', and '_http_headers'. Returns (None, None) on failure. 
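+    Direct Bunkr CDN links (a 'cdn' host with a recognised media extension)
+    are short-circuited into a single-file result before the album logic runs.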
""" + # --- START: New logic to handle direct CDN file URLs --- + try: + parsed_url = urllib.parse.urlparse(url) + # Check if the hostname contains 'cdn' and the path has a common file extension + is_direct_cdn_file = (parsed_url.hostname and 'cdn' in parsed_url.hostname and 'bunkr' in parsed_url.hostname and + any(parsed_url.path.lower().endswith(ext) for ext in ['.mp4', '.mkv', '.webm', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.rar'])) + + if is_direct_cdn_file: + logger.info("Bunkr direct file URL detected.") + filename = os.path.basename(parsed_url.path) + # Use the filename (without extension) as a sensible album name + album_name = os.path.splitext(filename)[0] + + files_to_download = [{ + 'url': url, + 'name': filename, + '_http_headers': {'Referer': 'https://bunkr.ru/'} # Use a generic Referer + }] + return album_name, files_to_download + except Exception as e: + logger.warning(f"Could not parse Bunkr URL for direct file check: {e}") + # --- END: New logic --- + + # This is the original logic for album and media pages extractor = get_bunkr_extractor(url, logger) if not extractor: return None, None @@ -238,4 +262,4 @@ def fetch_bunkr_data(url, logger): except Exception as e: logger.error(f"An error occurred while extracting Bunkr info: {e}", exc_info=True) - return None, None \ No newline at end of file + return None, None diff --git a/src/core/fap_nation_client.py b/src/core/fap_nation_client.py new file mode 100644 index 0000000..71d5aea --- /dev/null +++ b/src/core/fap_nation_client.py @@ -0,0 +1,125 @@ +import re +import os +import cloudscraper +from urllib.parse import urlparse, urljoin +from ..utils.file_utils import clean_folder_name + +def fetch_fap_nation_data(album_url, logger_func): + """ + Scrapes a fap-nation page by prioritizing HLS streams first, then falling + back to direct download links. Selects the highest quality available. + """ + logger_func(f" [Fap-Nation] Fetching album data from: {album_url}") + scraper = cloudscraper.create_scraper() + + try: + response = scraper.get(album_url, timeout=45) + response.raise_for_status() + html_content = response.text + + title_match = re.search(r']*itemprop="name"[^>]*>(.*?)', html_content, re.IGNORECASE) + album_slug = clean_folder_name(os.path.basename(urlparse(album_url).path.strip('/'))) + album_title = clean_folder_name(title_match.group(1).strip()) if title_match else album_slug + + files_to_download = [] + final_url = None + link_type = None + filename_from_video_tag = None + + video_tag_title_match = re.search(r'data-plyr-config=.*?"title":.*?"([^&]+?\.mp4)"', html_content, re.IGNORECASE) + if video_tag_title_match: + filename_from_video_tag = clean_folder_name(video_tag_title_match.group(1)) + logger_func(f" [Fap-Nation] Found high-quality filename in video tag: {filename_from_video_tag}") + + # --- REVISED LOGIC: HLS FIRST --- + + # 1. Prioritize finding an HLS stream. + logger_func(" [Fap-Nation] Priority 1: Searching for HLS stream...") + iframe_match = re.search(r']+src="([^"]+mediadelivery\.net[^"]+)"', html_content, re.IGNORECASE) + + if iframe_match: + iframe_url = iframe_match.group(1) + logger_func(f" [Fap-Nation] Found video iframe. 
Visiting: {iframe_url}") + try: + iframe_response = scraper.get(iframe_url, timeout=30) + iframe_response.raise_for_status() + iframe_html = iframe_response.text + + playlist_match = re.search(r']+src="([^"]+\.m3u8)"', iframe_html, re.IGNORECASE) + if playlist_match: + final_url = playlist_match.group(1) + link_type = 'hls' + logger_func(f" [Fap-Nation] Found embedded HLS stream in iframe: {final_url}") + except Exception as e: + logger_func(f" [Fap-Nation] ⚠️ Error fetching or parsing iframe content: {e}") + + if not final_url: + logger_func(" [Fap-Nation] No stream found in iframe. Checking main page content as a last resort...") + js_var_match = re.search(r'"(https?://[^"]+\.m3u8)"', html_content, re.IGNORECASE) + if js_var_match: + final_url = js_var_match.group(1) + link_type = 'hls' + logger_func(f" [Fap-Nation] Found HLS stream on main page: {final_url}") + + # 2. Fallback: If no HLS stream was found, search for direct links. + if not final_url: + logger_func(" [Fap-Nation] No HLS stream found. Priority 2 (Fallback): Searching for direct download links...") + direct_link_pattern = r']*href="([^"]+\.(?:mp4|webm|mkv|mov))"[^>]*>' + direct_links_found = re.findall(direct_link_pattern, html_content, re.IGNORECASE) + + if direct_links_found: + logger_func(f" [Fap-Nation] Found {len(direct_links_found)} direct media link(s). Selecting the best quality...") + best_link = direct_links_found[0] + for link in direct_links_found: + if '1080p' in link.lower(): + best_link = link + break + final_url = best_link + link_type = 'direct' + logger_func(f" [Fap-Nation] Identified direct media link: {final_url}") + + # If after all checks, we still have no URL, then fail. + if not final_url: + logger_func(" [Fap-Nation] ❌ Stage 1 Failed: Could not find any HLS stream or direct link.") + return None, [] + + # --- HLS Quality Selection Logic --- + if link_type == 'hls' and final_url: + logger_func(" [Fap-Nation] HLS stream found. Checking for higher quality variants...") + try: + master_playlist_response = scraper.get(final_url, timeout=20) + master_playlist_response.raise_for_status() + playlist_content = master_playlist_response.text + + streams = re.findall(r'#EXT-X-STREAM-INF:.*?RESOLUTION=(\d+)x(\d+).*?\n(.*?)\s', playlist_content) + + if streams: + best_stream = max(streams, key=lambda s: int(s[0]) * int(s[1])) + height = best_stream[1] + relative_path = best_stream[2] + new_final_url = urljoin(final_url, relative_path) + + logger_func(f" [Fap-Nation] ✅ Best quality found: {height}p. Updating URL to: {new_final_url}") + final_url = new_final_url + else: + logger_func(" [Fap-Nation] ℹ️ No alternate quality streams found in playlist. Using original.") + except Exception as e: + logger_func(f" [Fap-Nation] ⚠️ Could not parse HLS master playlist for quality selection: {e}. 
Using original URL.") + + if final_url and link_type: + if filename_from_video_tag: + base_name, _ = os.path.splitext(filename_from_video_tag) + new_filename = f"{base_name}.mp4" + else: + new_filename = f"{album_slug}.mp4" + + files_to_download.append({'url': final_url, 'filename': new_filename, 'type': link_type}) + logger_func(f" [Fap-Nation] ✅ Ready to download '{new_filename}' ({link_type} method).") + return album_title, files_to_download + + logger_func(f" [Fap-Nation] ❌ Could not determine a valid download link.") + return None, [] + + except Exception as e: + logger_func(f" [Fap-Nation] ❌ Error fetching Fap-Nation data: {e}") + return None, [] \ No newline at end of file diff --git a/src/core/mangadex_client.py b/src/core/mangadex_client.py new file mode 100644 index 0000000..961b38e --- /dev/null +++ b/src/core/mangadex_client.py @@ -0,0 +1,189 @@ +# src/core/mangadex_client.py + +import os +import re +import time +import cloudscraper +from collections import defaultdict +from ..utils.file_utils import clean_folder_name + +def fetch_mangadex_data(start_url, output_dir, logger_func, file_progress_callback, overall_progress_callback, pause_event, cancellation_event): + """ + Fetches and downloads all content from a MangaDex series or chapter URL. + Returns a tuple of (downloaded_count, skipped_count). + """ + grand_total_dl = 0 + grand_total_skip = 0 + + api = _MangadexAPI(logger_func) + + def _check_pause(): + if cancellation_event and cancellation_event.is_set(): return True + if pause_event and pause_event.is_set(): + logger_func(" Download paused...") + while pause_event.is_set(): + if cancellation_event and cancellation_event.is_set(): return True + time.sleep(0.5) + logger_func(" Download resumed.") + return cancellation_event.is_set() + + series_match = re.search(r"mangadex\.org/(?:title|manga)/([0-9a-f-]+)", start_url) + chapter_match = re.search(r"mangadex\.org/chapter/([0-9a-f-]+)", start_url) + + chapters_to_process = [] + if series_match: + series_id = series_match.group(1) + logger_func(f" Series detected. Fetching chapter list for ID: {series_id}") + chapters_to_process = api.get_manga_chapters(series_id, cancellation_event, pause_event) + elif chapter_match: + chapter_id = chapter_match.group(1) + logger_func(f" Single chapter detected. Fetching info for ID: {chapter_id}") + chapter_info = api.get_chapter_info(chapter_id) + if chapter_info: + chapters_to_process = [chapter_info] + + if not chapters_to_process: + logger_func("❌ No chapters found or failed to fetch chapter info.") + return 0, 0 + + logger_func(f"✅ Found {len(chapters_to_process)} chapter(s) to download.") + if overall_progress_callback: + overall_progress_callback.emit(len(chapters_to_process), 0) + + for chap_idx, chapter_json in enumerate(chapters_to_process): + if _check_pause(): break + try: + metadata = api.transform_chapter_data(chapter_json) + logger_func("-" * 40) + logger_func(f"Processing Chapter {chap_idx + 1}/{len(chapters_to_process)}: Vol. {metadata['volume']} Ch. {metadata['chapter']}{metadata['chapter_minor']} - {metadata['title']}") + + server_info = api.get_at_home_server(chapter_json["id"]) + if not server_info: + logger_func(" ❌ Could not get image server for this chapter. 
Skipping.") + continue + + base_url = f"{server_info['baseUrl']}/data/{server_info['chapter']['hash']}/" + image_files = server_info['chapter']['data'] + + series_folder = clean_folder_name(metadata['manga']) + chapter_folder_title = metadata['title'] or '' + chapter_folder = clean_folder_name(f"Vol {metadata['volume']:02d} Chap {metadata['chapter']:03d}{metadata['chapter_minor']} - {chapter_folder_title}".strip().strip('-').strip()) + final_save_path = os.path.join(output_dir, series_folder, chapter_folder) + os.makedirs(final_save_path, exist_ok=True) + + for img_idx, filename in enumerate(image_files): + if _check_pause(): break + + full_img_url = base_url + filename + img_path = os.path.join(final_save_path, f"{img_idx + 1:03d}{os.path.splitext(filename)[1]}") + + if os.path.exists(img_path): + logger_func(f" -> Skip ({img_idx+1}/{len(image_files)}): '{os.path.basename(img_path)}' already exists.") + grand_total_skip += 1 + continue + + logger_func(f" Downloading ({img_idx+1}/{len(image_files)}): '{os.path.basename(img_path)}'...") + + try: + response = api.session.get(full_img_url, stream=True, timeout=60, headers={'Referer': 'https://mangadex.org/'}) + response.raise_for_status() + total_size = int(response.headers.get('content-length', 0)) + + if file_progress_callback: + file_progress_callback.emit(os.path.basename(img_path), (0, total_size)) + + with open(img_path, 'wb') as f: + downloaded_bytes = 0 + for chunk in response.iter_content(chunk_size=8192): + if _check_pause(): break + f.write(chunk) + downloaded_bytes += len(chunk) + if file_progress_callback: + file_progress_callback.emit(os.path.basename(img_path), (downloaded_bytes, total_size)) + + if _check_pause(): + if os.path.exists(img_path): os.remove(img_path) + break + + grand_total_dl += 1 + except Exception as e: + logger_func(f" ❌ Failed to download page {img_idx+1}: {e}") + grand_total_skip += 1 + + if overall_progress_callback: + overall_progress_callback.emit(len(chapters_to_process), chap_idx + 1) + time.sleep(1) + + except Exception as e: + logger_func(f" ❌ An unexpected error occurred while processing chapter {chapter_json.get('id')}: {e}") + + return grand_total_dl, grand_total_skip + +class _MangadexAPI: + def __init__(self, logger_func): + self.logger_func = logger_func + self.session = cloudscraper.create_scraper() + self.root = "https://api.mangadex.org" + + def _call(self, endpoint, params=None, cancellation_event=None): + if cancellation_event and cancellation_event.is_set(): return None + try: + response = self.session.get(f"{self.root}{endpoint}", params=params, timeout=30) + if response.status_code == 429: + retry_after = int(response.headers.get("X-RateLimit-Retry-After", 5)) + self.logger_func(f" ⚠️ Rate limited. 
Waiting for {retry_after} seconds...") + time.sleep(retry_after) + return self._call(endpoint, params, cancellation_event) + response.raise_for_status() + return response.json() + except Exception as e: + self.logger_func(f" ❌ API call to '{endpoint}' failed: {e}") + return None + + def get_manga_chapters(self, series_id, cancellation_event, pause_event): + all_chapters = [] + offset = 0 + limit = 500 + base_params = { + "limit": limit, "order[volume]": "asc", "order[chapter]": "asc", + "translatedLanguage[]": ["en"], "includes[]": ["scanlation_group", "user", "manga"] + } + while True: + if cancellation_event.is_set(): break + while pause_event.is_set(): time.sleep(0.5) + + params = {**base_params, "offset": offset} + data = self._call(f"/manga/{series_id}/feed", params, cancellation_event) + if not data or data.get("result") != "ok": break + + results = data.get("data", []) + all_chapters.extend(results) + + if (offset + limit) >= data.get("total", 0): break + offset += limit + return all_chapters + + def get_chapter_info(self, chapter_id): + params = {"includes[]": ["scanlation_group", "user", "manga"]} + data = self._call(f"/chapter/{chapter_id}", params) + return data.get("data") if data and data.get("result") == "ok" else None + + def get_at_home_server(self, chapter_id): + return self._call(f"/at-home/server/{chapter_id}") + + def transform_chapter_data(self, chapter): + relationships = {item["type"]: item for item in chapter.get("relationships", [])} + manga = relationships.get("manga", {}) + c_attrs = chapter.get("attributes", {}) + m_attrs = manga.get("attributes", {}) + + chapter_num_str = c_attrs.get("chapter", "0") or "0" + chnum, sep, minor = chapter_num_str.partition(".") + + return { + "manga": (m_attrs.get("title", {}).get("en") or next(iter(m_attrs.get("title", {}).values()), "Unknown Series")), + "title": c_attrs.get("title", ""), + "volume": int(float(c_attrs.get("volume", 0) or 0)), + "chapter": int(float(chnum or 0)), + "chapter_minor": sep + minor if minor else "" + } \ No newline at end of file diff --git a/src/core/pixeldrain_client.py b/src/core/pixeldrain_client.py new file mode 100644 index 0000000..9b019b7 --- /dev/null +++ b/src/core/pixeldrain_client.py @@ -0,0 +1,93 @@ +import os +import re +import cloudscraper +from ..utils.file_utils import clean_folder_name +# --- ADDED IMPORTS --- +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +def fetch_pixeldrain_data(url: str, logger): + """ + Scrapes a given Pixeldrain URL to extract album or file information. + Handles single files (/u/), albums/lists (/l/), and folders (/d/). 
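+    Returns a (title, files) tuple, where files is a list of
+    {'url', 'filename'} dicts; returns (None, []) when the URL type is
+    unrecognised or a request fails.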
+ """ + logger(f"Fetching data for Pixeldrain URL: {url}") + scraper = cloudscraper.create_scraper() + root = "https://pixeldrain.com" + + # --- START OF FIX: Add a robust retry strategy --- + try: + retry_strategy = Retry( + total=5, # Total number of retries + backoff_factor=1, # Wait 1s, 2s, 4s, 8s between retries + status_forcelist=[429, 500, 502, 503, 504], # Retry on these server errors + allowed_methods=["HEAD", "GET"] + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + scraper.mount("https://", adapter) + scraper.mount("http://", adapter) + logger(" [Pixeldrain] Configured retry strategy for network requests.") + except Exception as e: + logger(f" [Pixeldrain] ⚠️ Could not configure retry strategy: {e}") + # --- END OF FIX --- + + file_match = re.search(r"/u/(\w+)", url) + album_match = re.search(r"/l/(\w+)", url) + folder_match = re.search(r"/d/([^?]+)", url) + + try: + if file_match: + file_id = file_match.group(1) + logger(f" Detected Pixeldrain File ID: {file_id}") + api_url = f"{root}/api/file/{file_id}/info" + data = scraper.get(api_url).json() + + title = data.get("name", file_id) + + files = [{ + 'url': f"{root}/api/file/{file_id}?download", + 'filename': data.get("name", f"{file_id}.tmp") + }] + return title, files + + elif album_match: + album_id = album_match.group(1) + logger(f" Detected Pixeldrain Album ID: {album_id}") + api_url = f"{root}/api/list/{album_id}" + data = scraper.get(api_url).json() + + title = data.get("title", album_id) + + files = [] + for file_info in data.get("files", []): + files.append({ + 'url': f"{root}/api/file/{file_info['id']}?download", + 'filename': file_info.get("name", f"{file_info['id']}.tmp") + }) + return title, files + + elif folder_match: + path_id = folder_match.group(1) + logger(f" Detected Pixeldrain Folder Path: {path_id}") + api_url = f"{root}/api/filesystem/{path_id}?stat" + data = scraper.get(api_url).json() + + path_info = data["path"][data["base_index"]] + title = path_info.get("name", path_id) + + files = [] + for child in data.get("children", []): + if child.get("type") == "file": + files.append({ + 'url': f"{root}/api/filesystem{child['path']}?attach", + 'filename': child.get("name") + }) + return title, files + + else: + logger(" ❌ Could not identify Pixeldrain URL type (file, album, or folder).") + return None, [] + + except Exception as e: + logger(f"❌ An error occurred while fetching Pixeldrain data: {e}") + return None, [] \ No newline at end of file diff --git a/src/core/simpcity_client.py b/src/core/simpcity_client.py new file mode 100644 index 0000000..bc427be --- /dev/null +++ b/src/core/simpcity_client.py @@ -0,0 +1,100 @@ +# src/core/simpcity_client.py + +import cloudscraper +from bs4 import BeautifulSoup +from urllib.parse import urlparse, unquote +import os +import re +from ..utils.file_utils import clean_folder_name +import urllib.parse + +def fetch_single_simpcity_page(url, logger_func, cookies=None, post_id=None): + """ + Scrapes a single SimpCity page for images, external links, video tags, and iframes. 
+ """ + scraper = cloudscraper.create_scraper() + headers = {'Referer': 'https://simpcity.cr/'} + + try: + response = scraper.get(url, timeout=30, headers=headers, cookies=cookies) + if response.status_code == 404: + return None, [] + response.raise_for_status() + soup = BeautifulSoup(response.text, 'html.parser') + + album_title = None + title_element = soup.find('h1', class_='p-title-value') + if title_element: + album_title = title_element.text.strip() + + search_scope = soup + if post_id: + post_content_container = soup.find('div', attrs={'data-lb-id': f'post-{post_id}'}) + if post_content_container: + logger_func(f" [SimpCity] ✅ Isolating search to post content container for ID {post_id}.") + search_scope = post_content_container + else: + logger_func(f" [SimpCity] ⚠️ Could not find content container for post ID {post_id}.") + + jobs_on_page = [] + + # Find native SimpCity images + image_tags = search_scope.find_all('img', class_='bbImage') + for img_tag in image_tags: + thumbnail_url = img_tag.get('src') + if not thumbnail_url or not isinstance(thumbnail_url, str) or 'saint2.su' in thumbnail_url: continue + full_url = thumbnail_url.replace('.md.', '.') + filename = img_tag.get('alt', '').replace('.md.', '.') or os.path.basename(unquote(urlparse(full_url).path)) + jobs_on_page.append({'type': 'image', 'filename': filename, 'url': full_url}) + + # Find links in tags, now with redirect handling + link_tags = search_scope.find_all('a', href=True) + for link in link_tags: + href = link.get('href', '') + + actual_url = href + if '/misc/goto?url=' in href: + try: + # Extract and decode the real URL from the 'url' parameter + parsed_href = urlparse(href) + query_params = dict(urllib.parse.parse_qsl(parsed_href.query)) + if 'url' in query_params: + actual_url = unquote(query_params['url']) + except Exception: + actual_url = href # Fallback if parsing fails + + # Perform all checks on the 'actual_url' which is now the real destination + if re.search(r'pixeldrain\.com/[lud]/', actual_url): jobs_on_page.append({'type': 'pixeldrain', 'url': actual_url}) + elif re.search(r'saint2\.(su|pk|cr|to)/embed/', actual_url): jobs_on_page.append({'type': 'saint2', 'url': actual_url}) + elif re.search(r'bunkr\.(?:cr|si|la|ws|is|ru|su|red|black|media|site|to|ac|ci|fi|pk|ps|sk|ph)|bunkrr\.ru', actual_url): jobs_on_page.append({'type': 'bunkr', 'url': actual_url}) + elif re.search(r'mega\.(nz|io)', actual_url): jobs_on_page.append({'type': 'mega', 'url': actual_url}) + elif re.search(r'gofile\.io', actual_url): jobs_on_page.append({'type': 'gofile', 'url': actual_url}) + + # Find direct Saint2 video embeds in