[LIB-13] Method scanUrl implementation for several scanners
author gotty <gotty@hedgecode.org>
Fri, 17 Jan 2020 23:51:29 +0000 (02:51 +0300)
committer gotty <gotty@hedgecode.org>
Fri, 17 Jan 2020 23:51:29 +0000 (02:51 +0300)
pom.xml
src/main/java/org/hedgecode/chess/scanner/ChessHogScannerConstants.java
src/main/java/org/hedgecode/chess/scanner/Settings.java
src/main/java/org/hedgecode/chess/scanner/json/JSONSettings.java
src/main/java/org/hedgecode/chess/scanner/portal/AbstractSettingsScanner.java
src/main/java/org/hedgecode/chess/scanner/portal/ChessBombScanner.java
src/main/java/org/hedgecode/chess/scanner/portal/ChessGamesScanner.java
src/main/java/org/hedgecode/chess/scanner/portal/LiChessScanner.java
src/main/resources/settings/chessbomb.settings
src/main/resources/settings/chessgames.settings
src/main/resources/settings/lichess.settings

diff --git a/pom.xml b/pom.xml
index fb01884..08b52bd 100644 (file)
--- a/pom.xml
+++ b/pom.xml
@@ -64,6 +64,7 @@
         <chessHogVersion>0.1-SNAPSHOT</chessHogVersion>
         <httpCoreVersion>4.4.11</httpCoreVersion>
         <httpClientVersion>4.5.9</httpClientVersion>
+        <commonsTextVersion>1.8</commonsTextVersion>
         <gsonVersion>2.8.0</gsonVersion>
         <junitVersion>4.12</junitVersion>
         <commonsConfigVersion>1.10</commonsConfigVersion>
             <version>${httpClientVersion}</version>
         </dependency>
         <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-text</artifactId>
+            <version>${commonsTextVersion}</version>
+        </dependency>
+        <dependency>
             <groupId>com.google.code.gson</groupId>
             <artifactId>gson</artifactId>
             <version>${gsonVersion}</version>
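diff --git a/src/main/java/org/hedgecode/chess/scanner/ChessHogScannerConstants.java b/src/main/java/org/hedgecode/chess/scanner/ChessHogScannerConstants.java
index 50eb391..fba0733 100644 (file)
--- a/src/main/java/org/hedgecode/chess/scanner/ChessHogScannerConstants.java
+++ b/src/main/java/org/hedgecode/chess/scanner/ChessHogScannerConstants.java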
@@ -42,6 +42,8 @@ public final class ChessHogScannerConstants {
     public static final String DOMAIN_CHESS24 = "chess24.com";
     public static final String DOMAIN_CHESSCOM = "chess.com";
 
+    public static final String PGN_DETECT_REGEX = "^\\[Event \"[^\"]+\"\\]$";
+
     public static final String PROXY_UNDEFINED = "undefined";
     public static final String PROXY_HTTP = "http";
     public static final String PROXY_SOCKS = "socks";
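diff --git a/src/main/java/org/hedgecode/chess/scanner/Settings.java b/src/main/java/org/hedgecode/chess/scanner/Settings.java
index 6a14f9b..27be0fc 100644 (file)
--- a/src/main/java/org/hedgecode/chess/scanner/Settings.java
+++ b/src/main/java/org/hedgecode/chess/scanner/Settings.java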
@@ -31,6 +31,10 @@ public interface Settings {
 
     String getTournamentGamesUrlRegex();
 
+    String getTournamentJsonUrlRegex();
+
+    String getTournamentNameRegex();
+
     String getTournamentQuery();
 
     String getTournamentQueryUrlRegex();
@@ -43,4 +47,8 @@ public interface Settings {
 
     String getGameUrlRegex();
 
+    String getGameJsonUrlRegex();
+
+    String getGameIdRegex();
+
 }
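diff --git a/src/main/java/org/hedgecode/chess/scanner/json/JSONSettings.java b/src/main/java/org/hedgecode/chess/scanner/json/JSONSettings.java
index 474acbe..826b343 100644 (file)
--- a/src/main/java/org/hedgecode/chess/scanner/json/JSONSettings.java
+++ b/src/main/java/org/hedgecode/chess/scanner/json/JSONSettings.java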
@@ -39,6 +39,12 @@ public class JSONSettings implements Settings {
     @SerializedName("tournamentGamesUrlRegex")
     private String tournamentGamesUrlRegex;
 
+    @SerializedName("tournamentJsonUrlRegex")
+    private String tournamentJsonUrlRegex;
+
+    @SerializedName("tournamentNameRegex")
+    private String tournamentNameRegex;
+
     @SerializedName("tournamentQuery")
     private String tournamentQuery;
 
@@ -57,6 +63,12 @@ public class JSONSettings implements Settings {
     @SerializedName("gameUrlRegex")
     private String gameUrlRegex;
 
+    @SerializedName("gameJsonUrlRegex")
+    private String gameJsonUrlRegex;
+
+    @SerializedName("gameIdRegex")
+    private String gameIdRegex;
+
     @Override
     public String getTournamentUrl() {
         return tournamentUrl;
@@ -78,6 +90,16 @@ public class JSONSettings implements Settings {
     }
 
     @Override
+    public String getTournamentJsonUrlRegex() {
+        return tournamentJsonUrlRegex;
+    }
+
+    @Override
+    public String getTournamentNameRegex() {
+        return tournamentNameRegex;
+    }
+
+    @Override
     public String getTournamentQuery() {
         return tournamentQuery;
     }
@@ -107,4 +129,14 @@ public class JSONSettings implements Settings {
         return gameUrlRegex;
     }
 
+    @Override
+    public String getGameJsonUrlRegex() {
+        return gameJsonUrlRegex;
+    }
+
+    @Override
+    public String getGameIdRegex() {
+        return gameIdRegex;
+    }
+
 }
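diff --git a/src/main/java/org/hedgecode/chess/scanner/portal/AbstractSettingsScanner.java b/src/main/java/org/hedgecode/chess/scanner/portal/AbstractSettingsScanner.java
index 0910047..445b603 100644 (file)
--- a/src/main/java/org/hedgecode/chess/scanner/portal/AbstractSettingsScanner.java
+++ b/src/main/java/org/hedgecode/chess/scanner/portal/AbstractSettingsScanner.java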
@@ -16,6 +16,9 @@
 
 package org.hedgecode.chess.scanner.portal;
 
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 import org.hedgecode.chess.scanner.ChessHogScannerException;
 import org.hedgecode.chess.scanner.Initiable;
 import org.hedgecode.chess.scanner.Settings;
@@ -24,6 +27,7 @@ import org.hedgecode.chess.scanner.regex.RegexBuilder;
 import org.hedgecode.chess.scanner.regex.RegexParams;
 import org.hedgecode.chess.scanner.spi.ServiceRegistry;
 
+import static org.hedgecode.chess.scanner.ChessHogScannerConstants.*;
 import static org.hedgecode.chess.scanner.regex.RegexBuilder.Type;
 
 /**
@@ -102,6 +106,18 @@ public abstract class AbstractSettingsScanner extends AbstractRequestScanner imp
         );
     }
 
+    protected String regex(String source, String regex) {
+        Matcher matcher = Pattern.compile(regex, Pattern.MULTILINE).matcher(source);
+        if (matcher.find()) {
+            return matcher.groupCount() > 0 ? matcher.group(1) : matcher.group();
+        }
+        return null;
+    }
+
+    protected boolean isPgnFormat(String source) {
+        return regex(source, PGN_DETECT_REGEX) != null;
+    }
+
     private String assignUrlWithParams(String url, String params) {
         return params != null
                 ? url.concat(params)
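diff --git a/src/main/java/org/hedgecode/chess/scanner/portal/ChessBombScanner.java b/src/main/java/org/hedgecode/chess/scanner/portal/ChessBombScanner.java
index 7b19256..9084f5a 100644 (file)
--- a/src/main/java/org/hedgecode/chess/scanner/portal/ChessBombScanner.java
+++ b/src/main/java/org/hedgecode/chess/scanner/portal/ChessBombScanner.java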
@@ -44,7 +44,7 @@ public class ChessBombScanner extends AbstractSettingsScanner {
     public PGNTournament scanTournament(String tournamentId) throws ChessHogScannerException {
         String decodeTournament = decodeUrlByRegex(
                 assignUrl(tournamentId, null),
-                getSettings().getTournamentGamesUrlRegex()
+                getSettings().getTournamentJsonUrlRegex()
         );
 
         TournamentFormat tournamentFormat = Format.formatTournament(decodeTournament);
@@ -87,9 +87,24 @@ public class ChessBombScanner extends AbstractSettingsScanner {
 
     @Override
     public PGNGame scanGame(String gameId, String tournamentId) throws ChessHogScannerException {
-        String decodeGame = decodeUrlByRegex(
+        return scanGameByRegex(
                 assignUrl(gameId, tournamentId, true),
-                getSettings().getGameUrlRegex()
+                getSettings().getGameJsonUrlRegex()
+        );
+    }
+
+    @Override
+    public PGNGame scanUrl(String gameUrl) throws ChessHogScannerException {
+        return scanGameByRegex(
+                gameUrl,
+                getSettings().getGameJsonUrlRegex()
+        );
+    }
+
+    private PGNGame scanGameByRegex(String gameUrl, String regex) throws ChessHogScannerException {
+        String decodeGame = decodeUrlByRegex(
+                gameUrl,
+                regex
         );
 
         GameFormat gameFormat = Format.formatGame(decodeGame);
@@ -100,18 +115,21 @@ public class ChessBombScanner extends AbstractSettingsScanner {
         );
     }
 
-    @Override
-    public PGNGame scanUrl(String gameUrl) throws ChessHogScannerException {
-        return null;
-    }
 
     private String decodeUrlByRegex(String url, String regex) throws ChessHogScannerException {
         String encodeString = match(
                 url,
                 regex
         );
+        if (encodeString == null) {
+            throw new ChessHogScannerException(
+                    String.format("Failed to decode source data for requesting URL: %s", url)
+            );
+        }
         return new String(
-                Base64.getDecoder().decode(encodeString)
+                Base64.getDecoder().decode(
+                        encodeString
+                )
         );
     }
 
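diff --git a/src/main/java/org/hedgecode/chess/scanner/portal/ChessGamesScanner.java b/src/main/java/org/hedgecode/chess/scanner/portal/ChessGamesScanner.java
index 7658ba1..28493cf 100644 (file)
--- a/src/main/java/org/hedgecode/chess/scanner/portal/ChessGamesScanner.java
+++ b/src/main/java/org/hedgecode/chess/scanner/portal/ChessGamesScanner.java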
@@ -47,6 +47,7 @@ public class ChessGamesScanner extends AbstractSettingsScanner {
 
     @Override
     public PGNTournament findTournament(String tournamentName) throws ChessHogScannerException {
+        PGNTournament tournament = null;
         Map<String, String> result = matchMap(
                 assignUrl(
                         tournamentName, true
@@ -54,7 +55,6 @@ public class ChessGamesScanner extends AbstractSettingsScanner {
                 getSettings().getTournamentQueryUrlRegex(),
                 true
         );
-        PGNTournament tournament = null;
         for (Map.Entry<String, String> entry : result.entrySet()) {
             if (entry.getValue().contains(tournamentName)) { // todo: contains?
                 tournament = new PGNTournament(
@@ -74,6 +74,11 @@ public class ChessGamesScanner extends AbstractSettingsScanner {
         String pgn = request(
                 assignUrl(gameId)
         );
+        if (!isPgnFormat(pgn)) {
+            throw new ChessHogScannerException(
+                    String.format("Failed to get PGN for requesting game ID: %s", gameId)
+            );
+        }
         return new PGNGame(
                 gameId, pgn
         );
@@ -86,7 +91,21 @@ public class ChessGamesScanner extends AbstractSettingsScanner {
 
     @Override
     public PGNGame scanUrl(String gameUrl) throws ChessHogScannerException {
-        return null;
+        String pgn = regex(
+                request(
+                        gameUrl
+                ),
+                getSettings().getGameUrlRegex()
+        );
+        if (pgn == null || !isPgnFormat(pgn)) {
+            throw new ChessHogScannerException(
+                    String.format("Failed to get PGN for requesting URL: %s", gameUrl)
+            );
+        }
+        return new PGNGame(
+                null,
+                pgn
+        );
     }
 
     private void assignTournamentGames(PGNTournament tournament) throws ChessHogScannerException {
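diff --git a/src/main/java/org/hedgecode/chess/scanner/portal/LiChessScanner.java b/src/main/java/org/hedgecode/chess/scanner/portal/LiChessScanner.java
index 134180e..18497a1 100644 (file)
--- a/src/main/java/org/hedgecode/chess/scanner/portal/LiChessScanner.java
+++ b/src/main/java/org/hedgecode/chess/scanner/portal/LiChessScanner.java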
 package org.hedgecode.chess.scanner.portal;
 
 import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+
+import org.apache.commons.text.StringEscapeUtils;
 
 import org.hedgecode.chess.scanner.ChessHogScannerException;
 import org.hedgecode.chess.scanner.entity.PGNGame;
 import org.hedgecode.chess.scanner.entity.PGNTournament;
+import org.hedgecode.chess.scanner.format.lichess.Format;
+import org.hedgecode.chess.scanner.format.lichess.GameFormat;
+
+import static org.hedgecode.chess.scanner.ChessHogScannerConstants.*;
 
 /**
  * LiChessScanner
@@ -33,10 +37,6 @@ public class LiChessScanner extends AbstractSettingsScanner {
 
     private static final String SETTINGS_FILENAME = "lichess.settings";
 
-    private static final String TOURNAMENT_GAMES_SPLIT_REGEX = "\\[Event \"[^\"]+\"\\]";
-    private static final String TOURNAMENT_NAME_REGEX = "\\[Event \"([^\"]+)\"\\]";
-    private static final String GAME_ID_REGEX = "\\[Site \"https://lichess.org/([^\"]+)\"\\]";
-
     @Override
     protected String getResourceName() {
         return SETTINGS_FILENAME;
@@ -61,6 +61,11 @@ public class LiChessScanner extends AbstractSettingsScanner {
         String pgn = request(
                 assignUrl(gameId)
         );
+        if (!isPgnFormat(pgn)) {
+            throw new ChessHogScannerException(
+                    String.format("Failed to get PGN for requesting game ID: %s", gameId)
+            );
+        }
         return new PGNGame(
                 gameId, pgn
         );
@@ -73,7 +78,38 @@ public class LiChessScanner extends AbstractSettingsScanner {
 
     @Override
     public PGNGame scanUrl(String gameUrl) throws ChessHogScannerException {
-        return null;
+        String gamePage = request(gameUrl);
+        String pgn = regex(
+                gamePage,
+                getSettings().getGameUrlRegex()
+        );
+        if (pgn == null) {
+            pgn = regex(
+                    gamePage,
+                    getSettings().getGameJsonUrlRegex()
+            );
+            if (pgn == null) {
+                throw new ChessHogScannerException(
+                        String.format("Failed to get source data for requesting URL: %s", gameUrl)
+                );
+            }
+            GameFormat gameFormat = Format.formatGame(pgn);
+            return new PGNGame(
+                    gameFormat.id(),
+                    gameFormat.pgn()
+            );
+        } else {
+            pgn = StringEscapeUtils.unescapeHtml4(pgn);
+            if (!isPgnFormat(pgn)) {
+                throw new ChessHogScannerException(
+                        String.format("Failed to get PGN for requesting URL: %s", gameUrl)
+                );
+            }
+            return new PGNGame(
+                    regex(pgn, getSettings().getGameIdRegex()),
+                    pgn
+            );
+        }
     }
 
     private void assignTournamentGames(PGNTournament tournament) throws ChessHogScannerException {
@@ -83,30 +119,27 @@ public class LiChessScanner extends AbstractSettingsScanner {
                         tournament.id(),
                         null
                 ),
-                TOURNAMENT_GAMES_SPLIT_REGEX
+                PGN_DETECT_REGEX
         );
 
         if (!pgnGames.isEmpty()) {
             tournament.setName(
-                    find(TOURNAMENT_NAME_REGEX, pgnGames.get(0))
+                    regex(
+                            pgnGames.get(0),
+                            getSettings().getTournamentNameRegex()
+                    )
             );
         }
 
         for (String pgn : pgnGames) {
-            String gameId = find(GAME_ID_REGEX, pgn);
+            String gameId = regex(
+                    pgn,
+                    getSettings().getGameIdRegex()
+            );
             tournament.addGame(
                     new PGNGame(gameId, pgn)
             );
         }
     }
 
-    private String find(String regex, String pgn) {
-        Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE);
-        Matcher matcher = pattern.matcher(pgn);
-        if (matcher.find()) {
-            return matcher.group(1);
-        }
-        return null;
-    }
-
 }
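diff --git a/src/main/resources/settings/chessbomb.settings b/src/main/resources/settings/chessbomb.settings
index d6e3b65..da75990 100644 (file)
--- a/src/main/resources/settings/chessbomb.settings
+++ b/src/main/resources/settings/chessbomb.settings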
@@ -1,9 +1,9 @@
 {
   "tournamentUrl": "https://www.chessbomb.com/arena/[tournamentId]",
   "tournamentIsMultiPage": false,
-  "tournamentGamesUrlRegex": "cbConfigData=\"([^\"]+)\"",
+  "tournamentJsonUrlRegex": "cbConfigData=\"([^\"]+)\"",
   "tournamentQuery": "https://www.chessbomb.com/arena/",
   "tournamentQueryUrlRegex": "cbConfigData=\"([^\"]+)\"",
   "gameUrl": "https://www.chessbomb.com/arena/[tournamentId]/[gameId]",
-  "gameUrlRegex": "cbConfigData=\"([^\"]+)\""
+  "gameJsonUrlRegex": "cbConfigData=\"([^\"]+)\""
 }
\ No newline at end of file
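diff --git a/src/main/resources/settings/chessgames.settings b/src/main/resources/settings/chessgames.settings
index 452a981..bae5460 100644 (file)
--- a/src/main/resources/settings/chessgames.settings
+++ b/src/main/resources/settings/chessgames.settings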
@@ -5,5 +5,6 @@
   "tournamentQuery": "https://www.chessgames.com/perl/tournaments?query=[query]",
   "tournamentQueryUrlRegex": "<a href=\"/perl/chess.pl\\?tid=([0-9]+)\">([^<]+)</a>",
   "gameUrl": "https://www.chessgames.com/perl/chessgame?gid=[gameId]",
-  "gamePgnUrl": "https://www.chessgames.com/perl/nph-chesspgn?gid=[gameId]&text=1"
+  "gamePgnUrl": "https://www.chessgames.com/perl/nph-chesspgn?gid=[gameId]&text=1",
+  "gameUrlRegex": "pgn='([^']+)'"
 }
\ No newline at end of file
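diff --git a/src/main/resources/settings/lichess.settings b/src/main/resources/settings/lichess.settings
index 4f44ba1..5a542fd 100644 (file)
--- a/src/main/resources/settings/lichess.settings
+++ b/src/main/resources/settings/lichess.settings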
@@ -2,7 +2,11 @@
   "tournamentUrl": "https://lichess.org/api/tournament/[tournamentId]/games",
   "tournamentIsMultiPage": false,
   "tournamentQueryParams": "?clocks=false&evals=false&opening=true",
+  "tournamentNameRegex": "\\[Event \"([^\"]+)\"\\]",
   "gameUrl": "https://lichess.org/[gameId]",
   "gamePgnUrl": "https://lichess.org/game/export/[gameId]",
-  "gameQueryParams": "?clocks=false&evals=false&literate=true"
+  "gameQueryParams": "?clocks=false&evals=false&literate=true",
+  "gameUrlRegex": "<div class=\"pgn\">([^<]+)</div>",
+  "gameJsonUrlRegex": "<script[^>]+>.*=(\\{.*\"data\":\\{\"game\"[^<]+})</script>",
+  "gameIdRegex" : "\\[Site \"https://lichess.org/([^\"]+)\"\\]"
 }
\ No newline at end of file