User:Daniel Mietchen/Sandbox4URLshortening
Jump to navigation
Jump to search
This page is used in conjunction with m:Special:UrlShortener as a workaround for Phabricator task T220703 (https://phabricator.wikimedia.org/T220703). URL shortening can also be triggered via the MediaWiki API. Another option for URL shortening is Query Chest.
The following query uses these:
- Items: Die Gartenlaube (Q655617)
- Properties: published in (P1433), main subject (P921), title (P1476), KIT Linked Open Numbers ID (P5176), numeric value (P1181)
# Most frequent n-grams from a random set of publications in the Gartenlaube
# which are missing main subject tags.
#
# The query is built from four named subqueries (WITH { ... } AS %name / INCLUDE):
#   %items   - sampled publications published in Q655617 that have no P921
#   %titles  - their German titles, cleaned, lower-cased and padded with sentinel tokens
#   %regexes - four regexes per word position, each capturing two adjacent tokens
#   %ngrams  - candidate n-grams (n <= 8) with per-n-gram count and score
#
# NOTE(review): no PREFIX declarations are given; the query presumably relies on
# the endpoint predefining wd:, wdt: and bd: - confirm for the target endpoint.
SELECT DISTINCT ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle
WITH {
  # Generating a list of entities to be analyzed:
  # up to 10000 sampled publications in Q655617, minus those that already have P921.
  SELECT ?Publication {
    SERVICE bd:sample {
      ?Publication wdt:P1433 wd:Q655617 .
      bd:serviceParam bd:sample.limit 10000
    }
    FILTER NOT EXISTS { ?Publication wdt:P921 ?Schlagwort. }
  }
} AS %items
WITH {
  # Preprocessing the titles
  SELECT ?Title ?Publication ?Seeds ?ClearTitleLength {
    INCLUDE %items
    ?Publication wdt:P1476 ?Title.
    # Remove some frequent special characters, including colons and semicolons
    # (those two are reused below as sentinel characters).
    BIND (REPLACE(STR(?Title),"[\\.:,;\\[\\]\\?()$]","") AS ?ClearTitle)
    BIND(STRLEN(?ClearTitle) AS ?ClearTitleLength)
    FILTER(LANG(?Title)="de")
    # Basic processing of the titles
    BIND ("::: ::: ::: ::: ::: ::: ::: ::: " AS ?StartCodon)
    BIND (" ;;; ;;; ;;; ;;; ;;; ;;; ;;; ;;;" AS ?StopCodon)
    # Start codon of colons assists with processing of n-grams at the beginning
    # of the title; stop codon of semicolons does the same at the end of the title.
    BIND (LCASE(CONCAT(?StartCodon, ?ClearTitle, ?StopCodon)) AS ?Seeds)
  }
} AS %titles
WITH {
  # Generating a list of regexes to look for the NumericValue-th word in a string
  # Based on https://w.wiki/KG$ by Jura1
  SELECT ?Regex1 ?Regex2 ?Regex3 ?Regex4 ?NumericValue {
    # Items carrying both P5176 and P1181 act as a source of integers.
    ?NumberItem wdt:P5176 [];
                wdt:P1181 ?NumericValue .
    FILTER( ?NumericValue > 0 )
    FILTER( ?NumericValue < 151)
    # Each regex has the form ^([^ ]+ ){k}([^ ]+) .*
    # $1 is the k-th space-terminated token, $2 the token that follows it.
    BIND("^([^ ]+ ){" AS ?RegexStart)
    BIND("}([^ ]+) .*" AS ?RegexEnd)
    # k = NumericValue - 1, + 1, + 3, + 5: together the four regexes cover
    # eight consecutive tokens.
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue - 1 ), ?RegexEnd ) AS ?Regex1)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 1 ), ?RegexEnd ) AS ?Regex2)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 3 ), ?RegexEnd ) AS ?Regex3)
    BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 5 ), ?RegexEnd ) AS ?Regex4)
  }
} AS %regexes
WITH {
  # Applying the regexes to the titles to extract ngrams (for n <= 8),
  # and counting occurrences of the ngrams across titles.
  # Score = Count * Length * (Dashes + 1) / N.
  SELECT DISTINCT
    ?Ngram
    ?N
    (COUNT(DISTINCT ?Title) AS ?Count)
    ?Length
    ?Dashes
    (( ?Count * ?Length * ( (?Dashes +1) / ?N) ) AS ?Score)
    (SAMPLE(DISTINCT ?Publication) AS ?ExamplePub)
  {
    INCLUDE %regexes
    INCLUDE %titles
    # Concatenate the eight captured tokens into one candidate string.
    BIND( (CONCAT(
             REPLACE(?Seeds, ?Regex1, "$1"), " ", REPLACE(?Seeds, ?Regex1, "$2"), " ",
             REPLACE(?Seeds, ?Regex2, "$1"), " ", REPLACE(?Seeds, ?Regex2, "$2"), " ",
             REPLACE(?Seeds, ?Regex3, "$1"), " ", REPLACE(?Seeds, ?Regex3, "$2"), " ",
             REPLACE(?Seeds, ?Regex4, "$1"), " ", REPLACE(?Seeds, ?Regex4, "$2")
           ) ) AS ?NgramCandidate)
    # Strip the sentinel characters, trim both ends, collapse runs of spaces.
    BIND( (REPLACE(
             REPLACE(
               REPLACE(
                 REPLACE(STR(?NgramCandidate), "([;:])", ""),
                 "(^\\s+)", ""),
               "(\\s+$)", ""),
             "([ ]{2,})", " ")
          ) AS ?Ngram)
    BIND(STRLEN(?Ngram) AS ?Length)
    FILTER (?Length > 3 )
    # NOTE(review): presumably this also discards candidates where a regex did
    # not match and REPLACE returned the whole seed string unchanged - confirm.
    FILTER (?Length <= ?ClearTitleLength )
    # Number of words = number of whitespace characters + 1.
    BIND(STRLEN(REPLACE(?Ngram, "\\S", "")) + 1 AS ?N)
    # Number of "-" characters in the n-gram.
    BIND((STRLEN(?Ngram) - STRLEN(REPLACE(?Ngram, "-", ""))) AS ?Dashes)
  }
  # Only the non-aggregated projected variables are grouping keys; ?Count, ?Score
  # and ?ExamplePub are computed per group in the SELECT clause.
  GROUP BY ?Ngram ?N ?Length ?Dashes
  # Optional: keep only n-grams occurring in more than one title.
  # HAVING(?Count > 1)
} AS %ngrams
WHERE {
  INCLUDE %ngrams
  # Exclude Ngrams starting or ending with any of a set of blacklisted words
  # NOTE(review): these are English words although titles are restricted to "de";
  # consider whether a German word list is intended here.
  BIND("(a|and|between|during|for|from|in|of|on|or|the|to|with)" AS ?blacklist)
  BIND( CONCAT( "(^", ?blacklist ,")+( )+") AS ?RegexBlackStart)
  BIND( CONCAT( "( )+(", ?blacklist ,")+$") AS ?RegexBlackEnd)
  FILTER (!REGEX(?Ngram, ?RegexBlackStart))
  FILTER (!REGEX(?Ngram, ?RegexBlackEnd))
  # # Exclude Ngrams too similar to the target
  # FILTER (!CONTAINS(?Ngram, "climate"))
  # FILTER (!CONTAINS(?Ngram, "change"))
  # Fetch the German title of the example publication for display.
  ?ExamplePub wdt:P1476 ?ExamplePubTitle.
  FILTER(LANG(?ExamplePubTitle)="de")
}
# SELECT DISTINCT already de-duplicates the projected rows, so no outer GROUP BY is needed.
ORDER BY DESC(?Score) DESC(?Count) DESC(?Length)
LIMIT 200