From 65db2625508c220fd3c0a1f37cdd2e13b6e02987 Mon Sep 17 00:00:00 2001 From: Claire Date: Tue, 2 Mar 2021 12:02:56 +0100 Subject: [PATCH] Update twitter-text from 1.14 to 3.1.0 and fix toot character counting (#15382) * Update twitter-text from 1.14 to 3.1.0 * Disable emoji parsing * Properly depend on twitter-text for url detection * Fix some URLs being wrongly detected client-side * Add test for server-side validation of non-autolinkable URLs * Fix server-side status length counting --- Gemfile | 2 +- Gemfile.lock | 5 +- .../mastodon/features/compose/util/counter.js | 2 +- .../features/compose/util/url_regex.js | 222 +++--------------- app/lib/extractor.rb | 8 +- app/lib/formatter.rb | 2 +- app/services/fetch_link_card_service.rb | 12 +- app/validators/status_length_validator.rb | 9 +- config/initializers/twitter_regex.rb | 10 +- package.json | 1 + spec/lib/formatter_spec.rb | 8 + .../status_length_validator_spec.rb | 8 + yarn.lock | 24 +- 13 files changed, 99 insertions(+), 214 deletions(-) diff --git a/Gemfile b/Gemfile index e9b90a147dc..53ee517fe80 100644 --- a/Gemfile +++ b/Gemfile @@ -93,7 +93,7 @@ gem 'sprockets-rails', '~> 3.2', require: 'sprockets/railtie' gem 'stoplight', '~> 2.2.1' gem 'strong_migrations', '~> 0.7' gem 'tty-prompt', '~> 0.23', require: false -gem 'twitter-text', '~> 1.14' +gem 'twitter-text', '~> 3.1.0' gem 'tzinfo-data', '~> 1.2021' gem 'webpacker', '~> 5.2' gem 'webpush' diff --git a/Gemfile.lock b/Gemfile.lock index ced6b9df9fe..8d1e6696257 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -641,7 +641,8 @@ GEM tty-screen (~> 0.8) wisper (~> 2.0) tty-screen (0.8.1) - twitter-text (1.14.7) + twitter-text (3.1.0) + idn-ruby unf (~> 0.1.0) tzinfo (1.2.9) thread_safe (~> 0.1) @@ -810,7 +811,7 @@ DEPENDENCIES strong_migrations (~> 0.7) thor (~> 1.1) tty-prompt (~> 0.23) - twitter-text (~> 1.14) + twitter-text (~> 3.1.0) tzinfo-data (~> 1.2021) webauthn (~> 3.0.0.alpha1) webmock (~> 3.12) diff --git a/app/javascript/mastodon/features/compose/util/counter.js b/app/javascript/mastodon/features/compose/util/counter.js index 700ba216354..7aa9e87b1b6 100644 --- a/app/javascript/mastodon/features/compose/util/counter.js +++ b/app/javascript/mastodon/features/compose/util/counter.js @@ -1,6 +1,6 @@ import { urlRegex } from './url_regex'; -const urlPlaceholder = 'xxxxxxxxxxxxxxxxxxxxxxx'; +const urlPlaceholder = '$2xxxxxxxxxxxxxxxxxxxxxxx'; export function countableText(inputText) { return inputText diff --git a/app/javascript/mastodon/features/compose/util/url_regex.js b/app/javascript/mastodon/features/compose/util/url_regex.js index 7f1e176207b..9c2005c53d2 100644 --- a/app/javascript/mastodon/features/compose/util/url_regex.js +++ b/app/javascript/mastodon/features/compose/util/url_regex.js @@ -1,196 +1,30 @@ -const regexen = {}; +import regexSupplant from 'twitter-text/dist/lib/regexSupplant'; +import validUrlPrecedingChars from 'twitter-text/dist/regexp/validUrlPrecedingChars'; +import validDomain from 'twitter-text/dist/regexp/validDomain'; +import validPortNumber from 'twitter-text/dist/regexp/validPortNumber'; +import validUrlPath from 'twitter-text/dist/regexp/validUrlPath'; +import validUrlQueryChars from 'twitter-text/dist/regexp/validUrlQueryChars'; +import validUrlQueryEndingChars from 'twitter-text/dist/regexp/validUrlQueryEndingChars'; -const regexSupplant = function(regex, flags) { - flags = flags || ''; - if (typeof regex !== 'string') { - if (regex.global && flags.indexOf('g') < 0) { - flags += 'g'; - } - if (regex.ignoreCase && flags.indexOf('i') < 0) { - flags += 'i'; - } - if (regex.multiline && flags.indexOf('m') < 0) { - flags += 'm'; - } +// The difference with twitter-text's extractURL is that the protocol isn't +// optional. - regex = regex.source; - } - return new RegExp(regex.replace(/#\{(\w+)\}/g, function(match, name) { - var newRegex = regexen[name] || ''; - if (typeof newRegex !== 'string') { - newRegex = newRegex.source; - } - return newRegex; - }), flags); -}; - -const stringSupplant = function(str, values) { - return str.replace(/#\{(\w+)\}/g, function(match, name) { - return values[name] || ''; - }); -}; - -export const urlRegex = (function() { - regexen.spaces_group = /\x09-\x0D\x20\x85\xA0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000/; - regexen.invalid_chars_group = /\uFFFE\uFEFF\uFFFF\u202A-\u202E/; - regexen.punct = /\!'#%&'\(\)*\+,\\\-\.\/:;<=>\?@\[\]\^_{|}~\$/; - regexen.validUrlPrecedingChars = regexSupplant(/(?:[^A-Za-z0-9@@$###{invalid_chars_group}]|^)/); - regexen.invalidDomainChars = stringSupplant('#{punct}#{spaces_group}#{invalid_chars_group}', regexen); - regexen.validDomainChars = regexSupplant(/[^#{invalidDomainChars}]/); - regexen.validSubdomain = regexSupplant(/(?:(?:#{validDomainChars}(?:[_-]|#{validDomainChars})*)?#{validDomainChars}\.)/); - regexen.validDomainName = regexSupplant(/(?:(?:#{validDomainChars}(?:-|#{validDomainChars})*)?#{validDomainChars}\.)/); - regexen.validGTLD = regexSupplant(RegExp( - '(?:(?:' + - '삼성|닷컴|닷넷|香格里拉|餐厅|食品|飞利浦|電訊盈科|集团|通販|购物|谷歌|诺基亚|联通|网络|网站|网店|网址|组织机构|移动|珠宝|点看|游戏|淡马锡|机构|書籍|时尚|新闻|政府|' + - '政务|手表|手机|我爱你|慈善|微博|广东|工行|家電|娱乐|天主教|大拿|大众汽车|在线|嘉里大酒店|嘉里|商标|商店|商城|公益|公司|八卦|健康|信息|佛山|企业|中文网|中信|世界|' + - 'ポイント|ファッション|セール|ストア|コム|グーグル|クラウド|みんな|คอม|संगठन|नेट|कॉम|همراه|موقع|موبايلي|كوم|كاثوليك|عرب|شبكة|' + - 'بيتك|بازار|العليان|ارامكو|اتصالات|ابوظبي|קום|сайт|рус|орг|онлайн|москва|ком|католик|дети|' + - 'zuerich|zone|zippo|zip|zero|zara|zappos|yun|youtube|you|yokohama|yoga|yodobashi|yandex|yamaxun|' + - 'yahoo|yachts|xyz|xxx|xperia|xin|xihuan|xfinity|xerox|xbox|wtf|wtc|wow|world|works|work|woodside|' + - 'wolterskluwer|wme|winners|wine|windows|win|williamhill|wiki|wien|whoswho|weir|weibo|wedding|wed|' + - 'website|weber|webcam|weatherchannel|weather|watches|watch|warman|wanggou|wang|walter|walmart|' + - 'wales|vuelos|voyage|voto|voting|vote|volvo|volkswagen|vodka|vlaanderen|vivo|viva|vistaprint|' + - 'vista|vision|visa|virgin|vip|vin|villas|viking|vig|video|viajes|vet|versicherung|' + - 'vermögensberatung|vermögensberater|verisign|ventures|vegas|vanguard|vana|vacations|ups|uol|uno|' + - 'university|unicom|uconnect|ubs|ubank|tvs|tushu|tunes|tui|tube|trv|trust|travelersinsurance|' + - 'travelers|travelchannel|travel|training|trading|trade|toys|toyota|town|tours|total|toshiba|' + - 'toray|top|tools|tokyo|today|tmall|tkmaxx|tjx|tjmaxx|tirol|tires|tips|tiffany|tienda|tickets|' + - 'tiaa|theatre|theater|thd|teva|tennis|temasek|telefonica|telecity|tel|technology|tech|team|tdk|' + - 'tci|taxi|tax|tattoo|tatar|tatamotors|target|taobao|talk|taipei|tab|systems|symantec|sydney|' + - 'swiss|swiftcover|swatch|suzuki|surgery|surf|support|supply|supplies|sucks|style|study|studio|' + - 'stream|store|storage|stockholm|stcgroup|stc|statoil|statefarm|statebank|starhub|star|staples|' + - 'stada|srt|srl|spreadbetting|spot|spiegel|space|soy|sony|song|solutions|solar|sohu|software|' + - 'softbank|social|soccer|sncf|smile|smart|sling|skype|sky|skin|ski|site|singles|sina|silk|shriram|' + - 'showtime|show|shouji|shopping|shop|shoes|shiksha|shia|shell|shaw|sharp|shangrila|sfr|sexy|sex|' + - 'sew|seven|ses|services|sener|select|seek|security|secure|seat|search|scot|scor|scjohnson|' + - 'science|schwarz|schule|school|scholarships|schmidt|schaeffler|scb|sca|sbs|sbi|saxo|save|sas|' + - 'sarl|sapo|sap|sanofi|sandvikcoromant|sandvik|samsung|samsclub|salon|sale|sakura|safety|safe|' + - 'saarland|ryukyu|rwe|run|ruhr|rugby|rsvp|room|rogers|rodeo|rocks|rocher|rmit|rip|rio|ril|' + - 'rightathome|ricoh|richardli|rich|rexroth|reviews|review|restaurant|rest|republican|report|' + - 'repair|rentals|rent|ren|reliance|reit|reisen|reise|rehab|redumbrella|redstone|red|recipes|' + - 'realty|realtor|realestate|read|raid|radio|racing|qvc|quest|quebec|qpon|pwc|pub|prudential|pru|' + - 'protection|property|properties|promo|progressive|prof|productions|prod|pro|prime|press|praxi|' + - 'pramerica|post|porn|politie|poker|pohl|pnc|plus|plumbing|playstation|play|place|pizza|pioneer|' + - 'pink|ping|pin|pid|pictures|pictet|pics|piaget|physio|photos|photography|photo|phone|philips|phd|' + - 'pharmacy|pfizer|pet|pccw|pay|passagens|party|parts|partners|pars|paris|panerai|panasonic|' + - 'pamperedchef|page|ovh|ott|otsuka|osaka|origins|orientexpress|organic|org|orange|oracle|open|ooo|' + - 'onyourside|online|onl|ong|one|omega|ollo|oldnavy|olayangroup|olayan|okinawa|office|off|observer|' + - 'obi|nyc|ntt|nrw|nra|nowtv|nowruz|now|norton|northwesternmutual|nokia|nissay|nissan|ninja|nikon|' + - 'nike|nico|nhk|ngo|nfl|nexus|nextdirect|next|news|newholland|new|neustar|network|netflix|netbank|' + - 'net|nec|nba|navy|natura|nationwide|name|nagoya|nadex|nab|mutuelle|mutual|museum|mtr|mtpc|mtn|' + - 'msd|movistar|movie|mov|motorcycles|moto|moscow|mortgage|mormon|mopar|montblanc|monster|money|' + - 'monash|mom|moi|moe|moda|mobily|mobile|mobi|mma|mls|mlb|mitsubishi|mit|mint|mini|mil|microsoft|' + - 'miami|metlife|merckmsd|meo|menu|men|memorial|meme|melbourne|meet|media|med|mckinsey|mcdonalds|' + - 'mcd|mba|mattel|maserati|marshalls|marriott|markets|marketing|market|map|mango|management|man|' + - 'makeup|maison|maif|madrid|macys|luxury|luxe|lupin|lundbeck|ltda|ltd|lplfinancial|lpl|love|lotto|' + - 'lotte|london|lol|loft|locus|locker|loans|loan|lixil|living|live|lipsy|link|linde|lincoln|limo|' + - 'limited|lilly|like|lighting|lifestyle|lifeinsurance|life|lidl|liaison|lgbt|lexus|lego|legal|' + - 'lefrak|leclerc|lease|lds|lawyer|law|latrobe|latino|lat|lasalle|lanxess|landrover|land|lancome|' + - 'lancia|lancaster|lamer|lamborghini|ladbrokes|lacaixa|kyoto|kuokgroup|kred|krd|kpn|kpmg|kosher|' + - 'komatsu|koeln|kiwi|kitchen|kindle|kinder|kim|kia|kfh|kerryproperties|kerrylogistics|kerryhotels|' + - 'kddi|kaufen|juniper|juegos|jprs|jpmorgan|joy|jot|joburg|jobs|jnj|jmp|jll|jlc|jio|jewelry|jetzt|' + - 'jeep|jcp|jcb|java|jaguar|iwc|iveco|itv|itau|istanbul|ist|ismaili|iselect|irish|ipiranga|' + - 'investments|intuit|international|intel|int|insure|insurance|institute|ink|ing|info|infiniti|' + - 'industries|immobilien|immo|imdb|imamat|ikano|iinet|ifm|ieee|icu|ice|icbc|ibm|hyundai|hyatt|' + - 'hughes|htc|hsbc|how|house|hotmail|hotels|hoteles|hot|hosting|host|hospital|horse|honeywell|' + - 'honda|homesense|homes|homegoods|homedepot|holiday|holdings|hockey|hkt|hiv|hitachi|hisamitsu|' + - 'hiphop|hgtv|hermes|here|helsinki|help|healthcare|health|hdfcbank|hdfc|hbo|haus|hangout|hamburg|' + - 'hair|guru|guitars|guide|guge|gucci|guardian|group|grocery|gripe|green|gratis|graphics|grainger|' + - 'gov|got|gop|google|goog|goodyear|goodhands|goo|golf|goldpoint|gold|godaddy|gmx|gmo|gmbh|gmail|' + - 'globo|global|gle|glass|glade|giving|gives|gifts|gift|ggee|george|genting|gent|gea|gdn|gbiz|' + - 'garden|gap|games|game|gallup|gallo|gallery|gal|fyi|futbol|furniture|fund|fun|fujixerox|fujitsu|' + - 'ftr|frontier|frontdoor|frogans|frl|fresenius|free|fox|foundation|forum|forsale|forex|ford|' + - 'football|foodnetwork|food|foo|fly|flsmidth|flowers|florist|flir|flights|flickr|fitness|fit|' + - 'fishing|fish|firmdale|firestone|fire|financial|finance|final|film|fido|fidelity|fiat|ferrero|' + - 'ferrari|feedback|fedex|fast|fashion|farmers|farm|fans|fan|family|faith|fairwinds|fail|fage|' + - 'extraspace|express|exposed|expert|exchange|everbank|events|eus|eurovision|etisalat|esurance|' + - 'estate|esq|erni|ericsson|equipment|epson|epost|enterprises|engineering|engineer|energy|emerck|' + - 'email|education|edu|edeka|eco|eat|earth|dvr|dvag|durban|dupont|duns|dunlop|duck|dubai|dtv|drive|' + - 'download|dot|doosan|domains|doha|dog|dodge|doctor|docs|dnp|diy|dish|discover|discount|directory|' + - 'direct|digital|diet|diamonds|dhl|dev|design|desi|dentist|dental|democrat|delta|deloitte|dell|' + - 'delivery|degree|deals|dealer|deal|dds|dclk|day|datsun|dating|date|data|dance|dad|dabur|cyou|' + - 'cymru|cuisinella|csc|cruises|cruise|crs|crown|cricket|creditunion|creditcard|credit|courses|' + - 'coupons|coupon|country|corsica|coop|cool|cookingchannel|cooking|contractors|contact|consulting|' + - 'construction|condos|comsec|computer|compare|company|community|commbank|comcast|com|cologne|' + - 'college|coffee|codes|coach|clubmed|club|cloud|clothing|clinique|clinic|click|cleaning|claims|' + - 'cityeats|city|citic|citi|citadel|cisco|circle|cipriani|church|chrysler|chrome|christmas|chloe|' + - 'chintai|cheap|chat|chase|channel|chanel|cfd|cfa|cern|ceo|center|ceb|cbs|cbre|cbn|cba|catholic|' + - 'catering|cat|casino|cash|caseih|case|casa|cartier|cars|careers|career|care|cards|caravan|car|' + - 'capitalone|capital|capetown|canon|cancerresearch|camp|camera|cam|calvinklein|call|cal|cafe|cab|' + - 'bzh|buzz|buy|business|builders|build|bugatti|budapest|brussels|brother|broker|broadway|' + - 'bridgestone|bradesco|box|boutique|bot|boston|bostik|bosch|boots|booking|book|boo|bond|bom|bofa|' + - 'boehringer|boats|bnpparibas|bnl|bmw|bms|blue|bloomberg|blog|blockbuster|blanco|blackfriday|' + - 'black|biz|bio|bingo|bing|bike|bid|bible|bharti|bet|bestbuy|best|berlin|bentley|beer|beauty|' + - 'beats|bcn|bcg|bbva|bbt|bbc|bayern|bauhaus|basketball|baseball|bargains|barefoot|barclays|' + - 'barclaycard|barcelona|bar|bank|band|bananarepublic|banamex|baidu|baby|azure|axa|aws|avianca|' + - 'autos|auto|author|auspost|audio|audible|audi|auction|attorney|athleta|associates|asia|asda|arte|' + - 'art|arpa|army|archi|aramco|arab|aquarelle|apple|app|apartments|aol|anz|anquan|android|analytics|' + - 'amsterdam|amica|amfam|amex|americanfamily|americanexpress|alstom|alsace|ally|allstate|allfinanz|' + - 'alipay|alibaba|alfaromeo|akdn|airtel|airforce|airbus|aigo|aig|agency|agakhan|africa|afl|' + - 'afamilycompany|aetna|aero|aeg|adult|ads|adac|actor|active|aco|accountants|accountant|accenture|' + - 'academy|abudhabi|abogado|able|abc|abbvie|abbott|abb|abarth|aarp|aaa|onion' + - ')(?=[^0-9a-zA-Z@]|$))')); - regexen.validCCTLD = regexSupplant(RegExp( - '(?:(?:' + - '한국|香港|澳門|新加坡|台灣|台湾|中國|中国|გე|ไทย|ලංකා|ഭാരതം|ಭಾರತ|భారత్|சிங்கப்பூர்|இலங்கை|இந்தியா|ଭାରତ|ભારત|ਭਾਰਤ|' + - 'ভাৰত|ভারত|বাংলা|भारोत|भारतम्|भारत|ڀارت|پاکستان|مليسيا|مصر|قطر|فلسطين|عمان|عراق|سورية|سودان|تونس|' + - 'بھارت|بارت|ایران|امارات|المغرب|السعودية|الجزائر|الاردن|հայ|қаз|укр|срб|рф|мон|мкд|ею|бел|бг|ελ|' + - 'zw|zm|za|yt|ye|ws|wf|vu|vn|vi|vg|ve|vc|va|uz|uy|us|um|uk|ug|ua|tz|tw|tv|tt|tr|tp|to|tn|tm|tl|tk|' + - 'tj|th|tg|tf|td|tc|sz|sy|sx|sv|su|st|ss|sr|so|sn|sm|sl|sk|sj|si|sh|sg|se|sd|sc|sb|sa|rw|ru|rs|ro|' + - 're|qa|py|pw|pt|ps|pr|pn|pm|pl|pk|ph|pg|pf|pe|pa|om|nz|nu|nr|np|no|nl|ni|ng|nf|ne|nc|na|mz|my|mx|' + - 'mw|mv|mu|mt|ms|mr|mq|mp|mo|mn|mm|ml|mk|mh|mg|mf|me|md|mc|ma|ly|lv|lu|lt|ls|lr|lk|li|lc|lb|la|kz|' + - 'ky|kw|kr|kp|kn|km|ki|kh|kg|ke|jp|jo|jm|je|it|is|ir|iq|io|in|im|il|ie|id|hu|ht|hr|hn|hm|hk|gy|gw|' + - 'gu|gt|gs|gr|gq|gp|gn|gm|gl|gi|gh|gg|gf|ge|gd|gb|ga|fr|fo|fm|fk|fj|fi|eu|et|es|er|eh|eg|ee|ec|dz|' + - 'do|dm|dk|dj|de|cz|cy|cx|cw|cv|cu|cr|co|cn|cm|cl|ck|ci|ch|cg|cf|cd|cc|ca|bz|by|bw|bv|bt|bs|br|bq|' + - 'bo|bn|bm|bl|bj|bi|bh|bg|bf|be|bd|bb|ba|az|ax|aw|au|at|as|ar|aq|ao|an|am|al|ai|ag|af|ae|ad|ac' + - ')(?=[^0-9a-zA-Z@]|$))')); - regexen.validPunycode = /(?:xn--[0-9a-z]+)/; - regexen.validSpecialCCTLD = /(?:(?:co|tv)(?=[^0-9a-zA-Z@]|$))/; - regexen.validDomain = regexSupplant(/(?:#{validSubdomain}*#{validDomainName}(?:#{validGTLD}|#{validCCTLD}|#{validPunycode}))/); - regexen.validPortNumber = /[0-9]+/; - regexen.pd = /\u002d\u058a\u05be\u1400\u1806\u2010-\u2015\u2e17\u2e1a\u2e3a\u2e40\u301c\u3030\u30a0\ufe31\ufe58\ufe63\uff0d/; - regexen.validGeneralUrlPathChars = regexSupplant(/[^#{spaces_group}\(\)\?]/i); - // Allow URL paths to contain up to two nested levels of balanced parens - // 1. Used in Wikipedia URLs like /Primer_(film) - // 2. Used in IIS sessions like /S(dfd346)/ - // 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/ - regexen.validUrlBalancedParens = regexSupplant( - '\\(' + - '(?:' + - '#{validGeneralUrlPathChars}+' + - '|' + - // allow one nested level of balanced parentheses - '(?:' + - '#{validGeneralUrlPathChars}*' + - '\\(' + - '#{validGeneralUrlPathChars}+' + - '\\)' + - '#{validGeneralUrlPathChars}*' + - ')' + - ')' + - '\\)', - 'i'); - // Valid end-of-path characters (so /foo. does not gobble the period). - // 1. Allow =&# for empty URL parameters and other URL-join artifacts - regexen.validUrlPathEndingChars = regexSupplant(/[^#{spaces_group}\(\)\?!\*';:=\,\.\$%\[\]#{pd}~&\|@]|(?:#{validUrlBalancedParens})/i); - // Allow @ in a url, but only in the middle. Catch things like http://example.com/@user/ - regexen.validUrlPath = regexSupplant('(?:' + - '(?:' + - '#{validGeneralUrlPathChars}*' + - '(?:#{validUrlBalancedParens}#{validGeneralUrlPathChars}*)*' + - '#{validUrlPathEndingChars}'+ - ')|(?:@#{validGeneralUrlPathChars}+\/)'+ - ')', 'i'); - regexen.validUrlQueryChars = /[a-z0-9!?\*'@\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i; - regexen.validUrlQueryEndingChars = /[a-z0-9_&=#\/]/i; - regexen.validUrl = regexSupplant( - '(' + // $1 URL - '(https?:\\/\\/)' + // $2 Protocol - '(#{validDomain})' + // $3 Domain(s) - '(?::(#{validPortNumber}))?' + // $4 Port number (optional) - '(\\/#{validUrlPath}*)?' + // $5 URL Path - '(\\?#{validUrlQueryChars}*#{validUrlQueryEndingChars})?' + // $6 Query String - ')', - 'gi'); - return regexen.validUrl; -}()); +export const urlRegex = regexSupplant( + '(' + // $1 URL + '(#{validUrlPrecedingChars})' + // $2 + '(https?:\\/\\/)' + // $3 Protocol + '(#{validDomain})' + // $4 Domain(s) + '(?::(#{validPortNumber}))?' + // $5 Port number (optional) + '(\\/#{validUrlPath}*)?' + // $6 URL Path + '(\\?#{validUrlQueryChars}*#{validUrlQueryEndingChars})?' + // $7 Query String + ')', + { + validUrlPrecedingChars, + validDomain, + validPortNumber, + validUrlPath, + validUrlQueryChars, + validUrlQueryEndingChars, + }, + 'gi', +); diff --git a/app/lib/extractor.rb b/app/lib/extractor.rb index 6076458ad6e..8020aa91660 100644 --- a/app/lib/extractor.rb +++ b/app/lib/extractor.rb @@ -1,20 +1,20 @@ # frozen_string_literal: true module Extractor - extend Twitter::Extractor + extend Twitter::TwitterText::Extractor module_function # :yields: username, list_slug, start, end def extract_mentions_or_lists_with_indices(text) - return [] unless Twitter::Regex[:at_signs].match?(text) + return [] unless Twitter::TwitterText::Regex[:at_signs].match?(text) possible_entries = [] text.to_s.scan(Account::MENTION_RE) do |screen_name, _| match_data = $LAST_MATCH_INFO after = $' - unless Twitter::Regex[:end_mention_match].match?(after) + unless Twitter::TwitterText::Regex[:end_mention_match].match?(after) start_position = match_data.char_begin(1) - 1 end_position = match_data.char_end(1) possible_entries << { @@ -44,7 +44,7 @@ module Extractor if %r{\A://}.match?(after) hash_text.match(/(.+)(https?\Z)/) do |matched| hash_text = matched[1] - end_position -= matched[2].char_length + end_position -= matched[2].codepoint_length end end diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb index 24a34a05945..7252234d6a9 100644 --- a/app/lib/formatter.rb +++ b/app/lib/formatter.rb @@ -260,7 +260,7 @@ class Formatter html_attrs[:rel] = "me #{html_attrs[:rel]}" if options[:me] - Twitter::Autolink.send(:link_to_text, entity, link_html(entity[:url]), url, html_attrs) + Twitter::TwitterText::Autolink.send(:link_to_text, entity, link_html(entity[:url]), url, html_attrs) rescue Addressable::URI::InvalidURIError, IDN::Idna::IdnaError encode(entity[:url]) end diff --git a/app/services/fetch_link_card_service.rb b/app/services/fetch_link_card_service.rb index 74fe9a0a581..d4e4931e670 100644 --- a/app/services/fetch_link_card_service.rb +++ b/app/services/fetch_link_card_service.rb @@ -2,12 +2,12 @@ class FetchLinkCardService < BaseService URL_PATTERN = %r{ - ( # $1 URL - (https?:\/\/) # $2 Protocol (required) - (#{Twitter::Regex[:valid_domain]}) # $3 Domain(s) - (?::(#{Twitter::Regex[:valid_port_number]}))? # $4 Port number (optional) - (/#{Twitter::Regex[:valid_url_path]}*)? # $5 URL Path and anchor - (\?#{Twitter::Regex[:valid_url_query_chars]}*#{Twitter::Regex[:valid_url_query_ending_chars]})? # $6 Query String + ( # $1 URL + (https?:\/\/) # $2 Protocol (required) + (#{Twitter::TwitterText::Regex[:valid_domain]}) # $3 Domain(s) + (?::(#{Twitter::TwitterText::Regex[:valid_port_number]}))? # $4 Port number (optional) + (/#{Twitter::TwitterText::Regex[:valid_url_path]}*)? # $5 URL Path and anchor + (\?#{Twitter::TwitterText::Regex[:valid_url_query_chars]}*#{Twitter::TwitterText::Regex[:valid_url_query_ending_chars]})? # $6 Query String ) }iox diff --git a/app/validators/status_length_validator.rb b/app/validators/status_length_validator.rb index 93bae2fa840..b56c5a3212e 100644 --- a/app/validators/status_length_validator.rb +++ b/app/validators/status_length_validator.rb @@ -2,6 +2,13 @@ class StatusLengthValidator < ActiveModel::Validator MAX_CHARS = 500 + URL_PATTERN = %r{ + (?: + (#{Twitter::TwitterText::Regex[:valid_url_preceding_chars]}) + (#{FetchLinkCardService::URL_PATTERN}) + ) + }iox + URL_PLACEHOLDER = "\1#{'x' * 23}" def validate(status) return unless status.local? && !status.reblog? @@ -28,7 +35,7 @@ class StatusLengthValidator < ActiveModel::Validator return '' if @status.text.nil? @status.text.dup.tap do |new_text| - new_text.gsub!(FetchLinkCardService::URL_PATTERN, 'x' * 23) + new_text.gsub!(URL_PATTERN, URL_PLACEHOLDER) new_text.gsub!(Account::MENTION_RE, '@\2') end end diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb index aca85dd43a5..3ff2aa9e5de 100644 --- a/config/initializers/twitter_regex.rb +++ b/config/initializers/twitter_regex.rb @@ -1,4 +1,10 @@ -module Twitter +module Twitter::TwitterText + class Configuration + def emoji_parsing_enabled + false + end + end + class Regex REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou @@ -79,7 +85,7 @@ module Twitter return [] unless text && text.index(":") urls = [] - text.to_s.scan(Twitter::Regex[:valid_extended_uri]) do + text.to_s.scan(Twitter::TwitterText::Regex[:valid_extended_uri]) do valid_uri_match_data = $~ start_position = valid_uri_match_data.char_begin(3) diff --git a/package.json b/package.json index 111c2f20959..bff9db5667d 100644 --- a/package.json +++ b/package.json @@ -162,6 +162,7 @@ "tesseract.js": "^2.1.1", "throng": "^4.0.0", "tiny-queue": "^0.2.1", + "twitter-text": "3.1.0", "uuid": "^8.3.1", "webpack": "^4.46.0", "webpack-assets-manifest": "^4.0.1", diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb index 633d59c2abc..5c88a2569ac 100644 --- a/spec/lib/formatter_spec.rb +++ b/spec/lib/formatter_spec.rb @@ -21,6 +21,14 @@ RSpec.describe Formatter do end end + context 'given a stand-alone URL with a newer TLD' do + let(:text) { 'http://example.gay' } + + it 'matches the full URL' do + is_expected.to include 'href="http://example.gay"' + end + end + context 'given a stand-alone IDN URL' do let(:text) { 'https://nic.みんな/' } diff --git a/spec/validators/status_length_validator_spec.rb b/spec/validators/status_length_validator_spec.rb index 11e55f9335f..bef3f29f50c 100644 --- a/spec/validators/status_length_validator_spec.rb +++ b/spec/validators/status_length_validator_spec.rb @@ -42,6 +42,14 @@ describe StatusLengthValidator do expect(status.errors).to_not have_received(:add) end + it 'does not count non-autolinkable URLs as 23 characters flat' do + text = ('a' * 476) + "http://#{'b' * 30}.com/example" + status = double(spoiler_text: '', text: text, errors: double(add: nil), local?: true, reblog?: false) + + subject.validate(status) + expect(status.errors).to have_received(:add) + end + it 'counts only the front part of remote usernames' do text = ('a' * 475) + " @alice@#{'b' * 30}.com" status = double(spoiler_text: '', text: text, errors: double(add: nil), local?: true, reblog?: false) diff --git a/yarn.lock b/yarn.lock index 30da074bc8f..4d6fb4b2209 100644 --- a/yarn.lock +++ b/yarn.lock @@ -939,7 +939,7 @@ dependencies: regenerator-runtime "^0.12.0" -"@babel/runtime@^7.1.2", "@babel/runtime@^7.10.2", "@babel/runtime@^7.11.2", "@babel/runtime@^7.12.1", "@babel/runtime@^7.12.5", "@babel/runtime@^7.13.8", "@babel/runtime@^7.2.0", "@babel/runtime@^7.4.4", "@babel/runtime@^7.5.5", "@babel/runtime@^7.6.3", "@babel/runtime@^7.7.2", "@babel/runtime@^7.8.4", "@babel/runtime@^7.9.2": +"@babel/runtime@^7.1.2", "@babel/runtime@^7.10.2", "@babel/runtime@^7.11.2", "@babel/runtime@^7.12.1", "@babel/runtime@^7.12.5", "@babel/runtime@^7.13.8", "@babel/runtime@^7.2.0", "@babel/runtime@^7.3.1", "@babel/runtime@^7.4.4", "@babel/runtime@^7.5.5", "@babel/runtime@^7.6.3", "@babel/runtime@^7.7.2", "@babel/runtime@^7.8.4", "@babel/runtime@^7.9.2": version "7.13.8" resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.13.8.tgz#cc886a85c072df1de23670dc1aa59fc116c4017c" integrity sha512-CwQljpw6qSayc0fRG1soxHAKs1CnQMOChm4mlQP6My0kf9upVGizj/KhlTTgyUnETmHpcUXjaluNAkteRFuafg== @@ -3217,6 +3217,11 @@ core-js@^2.4.0: resolved "https://registry.yarnpkg.com/core-js/-/core-js-2.6.11.tgz#38831469f9922bded8ee21c9dc46985e0399308c" integrity sha512-5wjnpaT/3dV+XB4borEsnAYQchn00XSgTAWKDkEqv+K8KevjbzmofK6hfJ9TZIlpj2N0xQpazy7PiRQiWHqzWg== +core-js@^2.5.0: + version "2.6.12" + resolved "https://registry.yarnpkg.com/core-js/-/core-js-2.6.12.tgz#d9333dfa7b065e347cc5682219d6f690859cc2ec" + integrity sha512-Kb2wC0fvsWfQrgk8HU5lW6U/Lcs8+9aaYcy4ZFc6DDlo4nZ7n70dEgE5rtR0oG6ufKDUnrwfWL1mXR5ljDatrQ== + core-util-is@1.0.2, core-util-is@~1.0.0: version "1.0.2" resolved "https://registry.yarnpkg.com/core-util-is/-/core-util-is-1.0.2.tgz#b5fd54220aa2bc5ab57aab7140c940754503c1a7" @@ -8649,7 +8654,7 @@ punycode@1.3.2: resolved "https://registry.yarnpkg.com/punycode/-/punycode-1.3.2.tgz#9653a036fb7c1ee42342f2325cceefea3926c48d" integrity sha1-llOgNvt8HuQjQvIyXM7v6jkmxI0= -punycode@^1.2.4: +punycode@1.4.1, punycode@^1.2.4: version "1.4.1" resolved "https://registry.yarnpkg.com/punycode/-/punycode-1.4.1.tgz#c0d5a63b2718800ad8e1eb0fa5269c84dd41845e" integrity sha1-wNWmOycYgArY4esPpSachN1BhF4= @@ -10722,6 +10727,21 @@ tweetnacl@^0.14.3, tweetnacl@~0.14.0: resolved "https://registry.yarnpkg.com/tweetnacl/-/tweetnacl-0.14.5.tgz#5ae68177f192d4456269d108afa93ff8743f4f64" integrity sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q= +twemoji-parser@^11.0.2: + version "11.0.2" + resolved "https://registry.yarnpkg.com/twemoji-parser/-/twemoji-parser-11.0.2.tgz#24e87c2008abe8544c962f193b88b331de32b446" + integrity sha512-5kO2XCcpAql6zjdLwRwJjYvAZyDy3+Uj7v1ipBzLthQmDL7Ce19bEqHr3ImSNeoSW2OA8u02XmARbXHaNO8GhA== + +twitter-text@3.1.0: + version "3.1.0" + resolved "https://registry.yarnpkg.com/twitter-text/-/twitter-text-3.1.0.tgz#798e932b289f506efe2a1f03fe917ba30627f125" + integrity sha512-nulfUi3FN6z0LUjYipJid+eiwXvOLb8Ass7Jy/6zsXmZK3URte043m8fL3FyDzrK+WLpyqhHuR/TcARTN/iuGQ== + dependencies: + "@babel/runtime" "^7.3.1" + core-js "^2.5.0" + punycode "1.4.1" + twemoji-parser "^11.0.2" + type-check@^0.4.0, type-check@~0.4.0: version "0.4.0" resolved "https://registry.yarnpkg.com/type-check/-/type-check-0.4.0.tgz#07b8203bfa7056c0657050e3ccd2c37730bab8f1"