User:Daniel Mietchen/Sandbox4URLshortening

From Wikidata
Jump to navigation Jump to search

This page is used in conjunction with m:Special:UrlShortener as a workaround for a known issue (the link to the issue is missing from this copy). URL shortening can also be triggered via the MediaWiki API. Another option for URL shortening is Query Chest.

The following query uses these:

  • Properties: published in (P1433), main subject (P921), title (P1476), KIT Linked Open Numbers ID (P5176), numeric value (P1181)
    # Most frequent n-grams from a random set of publications in the Gartenlaube which are missing main subject tags
    #
    # Pipeline (Blazegraph named subqueries, each introduced by WITH { ... } AS %name):
    #   %items   - random sample of publications without a main subject (P921)
    #   %titles  - cleaned, lower-cased titles padded with start/stop "codons"
    #   %regexes - regexes that pick the NumericValue-th word out of a string
    #   %ngrams  - n-grams (n <= 8) extracted from the titles, with counts and a score
    #
    # NOTE(review): this copy of the query had lost several structural tokens
    # (SELECT / WITH / WHERE { / } and three expression openers). They are
    # restored below; every reconstructed expression is marked NOTE(review).
    SELECT
    DISTINCT ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle
    WITH
    { # Generating a list of entities to be analyzed
      SELECT ?Publication
      WHERE {
          # Random sample of up to 10000 items published in (P1433) wd:Q655617
          # (presumably the Gartenlaube item, per the header comment)
          SERVICE bd:sample { ?Publication wdt:P1433 wd:Q655617 . bd:serviceParam bd:sample.limit 10000 }
          # Keep only publications that have no main subject (P921) yet
          FILTER NOT EXISTS { ?Publication wdt:P921 ?Schlagwort. }
      }
    } AS %items
    WITH
    { # Preprocessing the titles
      SELECT ?Title ?Publication ?Seeds ?ClearTitleLength
      WHERE {
          INCLUDE %items
          ?Publication wdt:P1476 ?Title.
          BIND (REPLACE(STR(?Title),"[\\.:,;\\[\\]\\?()$]","") AS ?ClearTitle) # remove some frequent special characters, including colons and semicolons
          BIND(STRLEN(?ClearTitle) AS ?ClearTitleLength)
          # Basic processing of the titles
          BIND ("::: ::: ::: ::: ::: ::: ::: ::: " AS ?StartCodon)
          BIND (" ;;; ;;; ;;; ;;; ;;; ;;; ;;; ;;;" AS ?StopCodon)
          # NOTE(review): the ?ClearTitle argument was missing between the two
          # codons; without it ?Seeds would contain no title text at all.
          BIND (LCASE(CONCAT(?StartCodon , # add start codon of colons to assist with processing of n-grams at beginning of title
                             ?ClearTitle ,
                             ?StopCodon)) # add stop codon of semicolons to assist with processing of n-grams at end of title
                         AS ?Seeds )
      }
    } AS %titles
    WITH
    { # Generating a list of regexes to look for the NumericValue-th word in a string
      # Based on work by Jura1 (the link to the original is missing from this copy)
      SELECT ?Regex1 ?Regex2 ?Regex3 ?Regex4 ?NumericValue
      WHERE {
          # Number items (those with a P5176 statement) supply the integers 1..150
          ?NumberItem wdt:P5176 []; wdt:P1181 ?NumericValue .
          FILTER( ?NumericValue > 0 )
          FILTER( ?NumericValue < 151)
          BIND("^([^ ]+ ){" AS ?RegexStart)
          BIND("}([^ ]+) .*" AS ?RegexEnd)
          # Each regex skips k space-terminated words and captures the next one;
          # k advances in steps of two so that four regexes cover adjacent word pairs.
          BIND( CONCAT( ?RegexStart , STR( ?NumericValue - 1 ), ?RegexEnd ) AS ?Regex1)
          BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 1 ), ?RegexEnd ) AS ?Regex2)
          BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 3 ), ?RegexEnd ) AS ?Regex3)
          BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 5 ), ?RegexEnd ) AS ?Regex4)
      }
    } AS %regexes
    WITH
    { # Applying the regexes to the titles to extract ngrams (for n <= 8), and counting occurrences of the ngrams across titles
      # NOTE(review): ?N ?Length ?Dashes are projected here because the outer
      # SELECT and both GROUP BY clauses use them; confirm against the original.
      SELECT
        DISTINCT ?Ngram
        ?N
        (COUNT(DISTINCT ?Title) AS ?Count)
        ?Length
        ?Dashes
        (( ?Count * ?Length * ( (?Dashes +1) / ?N)
         ) AS ?Score)
        (SAMPLE(DISTINCT ?Publication) AS ?ExamplePub)
      WHERE {
            INCLUDE %regexes
            INCLUDE %titles
            # "$1" is the last word consumed by the repeated group (with its
            # trailing space), "$2" the word right after it.
            # NOTE(review): the BIND(CONCAT( opener was missing and is restored.
            BIND(CONCAT(
                REPLACE(?Seeds, ?Regex1, "$1"), " ",
                REPLACE(?Seeds, ?Regex1, "$2"), " ",
                REPLACE(?Seeds, ?Regex2, "$1"), " ",
                REPLACE(?Seeds, ?Regex2, "$2"), " ",
                REPLACE(?Seeds, ?Regex3, "$1"), " ",
                REPLACE(?Seeds, ?Regex3, "$2"), " ",
                REPLACE(?Seeds, ?Regex4, "$1"), " ",
                REPLACE(?Seeds, ?Regex4, "$2")
            ) AS ?NgramCandidate)
            # NOTE(review): only the outer space-collapsing REPLACE survived in
            # this copy. The inner REPLACE that strips the ":::" / ";;;" codons
            # (with their adjacent spaces) is a reconstruction - verify it
            # against the original query.
            BIND((REPLACE(
                REPLACE(?NgramCandidate, "(::: *| *;;;)", ""),
                "([ ]{2,})"," ")
              ) AS ?Ngram)
            BIND(STRLEN(?Ngram) AS ?Length)
            FILTER (?Length > 3 )
            # When a regex does not match, REPLACE returns the whole ?Seeds
            # string; this length bound discards such candidates.
            FILTER (?Length <= ?ClearTitleLength )
            # Number of words = number of whitespace characters + 1
            BIND(STRLEN(REPLACE(?Ngram, "\\S", "")) + 1 as ?N)
            # Number of "-" characters in the n-gram
            BIND((STRLEN(?Ngram) - STRLEN(REPLACE(?Ngram, "-", "")))  as ?Dashes)
      }
      GROUP BY ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub
    #   HAVING(?Count > 1)
    } AS %ngrams
    WHERE {
      INCLUDE %ngrams
      # Exclude Ngrams starting or ending with any of a set of blacklisted words
      BIND("(a|and|between|during|for|from|in|of|on|or|the|to|with)" AS ?blacklist)
      BIND( CONCAT( "(^", ?blacklist ,")+( )+") AS ?RegexBlackStart)
      BIND( CONCAT( "( )+(", ?blacklist ,")+$") AS ?RegexBlackEnd)
      FILTER (!REGEX(?Ngram, ?RegexBlackStart))
      FILTER (!REGEX(?Ngram, ?RegexBlackEnd))
    #   # Exclude Ngrams too similar to the target
    #   FILTER (!CONTAINS(?Ngram, "climate"))
    #   FILTER (!CONTAINS(?Ngram, "change"))
      # Fetch the title of the sampled example publication for display
      ?ExamplePub wdt:P1476 ?ExamplePubTitle.
    }
    GROUP BY ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle
    ORDER BY DESC(?Score) DESC(?Count) DESC(?Length)
    LIMIT 200