From: gotty Date: Fri, 17 Jan 2020 23:51:29 +0000 (+0300) Subject: [LIB-13] Method scanUrl implementation for several scanners X-Git-Url: https://git.hedgecode.org/?a=commitdiff_plain;h=cbcefd2f85fa56b769bc211a134c7df1e97ef0be;p=chesshog-scanner.git [LIB-13] Method scanUrl implementation for several scanners --- diff --git a/pom.xml b/pom.xml index fb01884..08b52bd 100644 --- a/pom.xml +++ b/pom.xml @@ -64,6 +64,7 @@ 0.1-SNAPSHOT 4.4.11 4.5.9 + 1.8 2.8.0 4.12 1.10 @@ -86,6 +87,11 @@ ${httpClientVersion} + org.apache.commons + commons-text + ${commonsTextVersion} + + com.google.code.gson gson ${gsonVersion} diff --git a/src/main/java/org/hedgecode/chess/scanner/ChessHogScannerConstants.java b/src/main/java/org/hedgecode/chess/scanner/ChessHogScannerConstants.java index 50eb391..fba0733 100644 --- a/src/main/java/org/hedgecode/chess/scanner/ChessHogScannerConstants.java +++ b/src/main/java/org/hedgecode/chess/scanner/ChessHogScannerConstants.java @@ -42,6 +42,8 @@ public final class ChessHogScannerConstants { public static final String DOMAIN_CHESS24 = "chess24.com"; public static final String DOMAIN_CHESSCOM = "chess.com"; + public static final String PGN_DETECT_REGEX = "^\\[Event \"[^\"]+\"\\]$"; + public static final String PROXY_UNDEFINED = "undefined"; public static final String PROXY_HTTP = "http"; public static final String PROXY_SOCKS = "socks"; diff --git a/src/main/java/org/hedgecode/chess/scanner/Settings.java b/src/main/java/org/hedgecode/chess/scanner/Settings.java index 6a14f9b..27be0fc 100644 --- a/src/main/java/org/hedgecode/chess/scanner/Settings.java +++ b/src/main/java/org/hedgecode/chess/scanner/Settings.java @@ -31,6 +31,10 @@ public interface Settings { String getTournamentGamesUrlRegex(); + String getTournamentJsonUrlRegex(); + + String getTournamentNameRegex(); + String getTournamentQuery(); String getTournamentQueryUrlRegex(); @@ -43,4 +47,8 @@ public interface Settings { String getGameUrlRegex(); + String getGameJsonUrlRegex(); + + String getGameIdRegex(); + } diff --git a/src/main/java/org/hedgecode/chess/scanner/json/JSONSettings.java b/src/main/java/org/hedgecode/chess/scanner/json/JSONSettings.java index 474acbe..826b343 100644 --- a/src/main/java/org/hedgecode/chess/scanner/json/JSONSettings.java +++ b/src/main/java/org/hedgecode/chess/scanner/json/JSONSettings.java @@ -39,6 +39,12 @@ public class JSONSettings implements Settings { @SerializedName("tournamentGamesUrlRegex") private String tournamentGamesUrlRegex; + @SerializedName("tournamentJsonUrlRegex") + private String tournamentJsonUrlRegex; + + @SerializedName("tournamentNameRegex") + private String tournamentNameRegex; + @SerializedName("tournamentQuery") private String tournamentQuery; @@ -57,6 +63,12 @@ public class JSONSettings implements Settings { @SerializedName("gameUrlRegex") private String gameUrlRegex; + @SerializedName("gameJsonUrlRegex") + private String gameJsonUrlRegex; + + @SerializedName("gameIdRegex") + private String gameIdRegex; + @Override public String getTournamentUrl() { return tournamentUrl; @@ -78,6 +90,16 @@ public class JSONSettings implements Settings { } @Override + public String getTournamentJsonUrlRegex() { + return tournamentJsonUrlRegex; + } + + @Override + public String getTournamentNameRegex() { + return tournamentNameRegex; + } + + @Override public String getTournamentQuery() { return tournamentQuery; } @@ -107,4 +129,14 @@ public class JSONSettings implements Settings { return gameUrlRegex; } + @Override + public String getGameJsonUrlRegex() { + return gameJsonUrlRegex; + } + + @Override + public String getGameIdRegex() { + return gameIdRegex; + } + } diff --git a/src/main/java/org/hedgecode/chess/scanner/portal/AbstractSettingsScanner.java b/src/main/java/org/hedgecode/chess/scanner/portal/AbstractSettingsScanner.java index 0910047..445b603 100644 --- a/src/main/java/org/hedgecode/chess/scanner/portal/AbstractSettingsScanner.java +++ b/src/main/java/org/hedgecode/chess/scanner/portal/AbstractSettingsScanner.java @@ -16,6 +16,9 @@ package org.hedgecode.chess.scanner.portal; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + import org.hedgecode.chess.scanner.ChessHogScannerException; import org.hedgecode.chess.scanner.Initiable; import org.hedgecode.chess.scanner.Settings; @@ -24,6 +27,7 @@ import org.hedgecode.chess.scanner.regex.RegexBuilder; import org.hedgecode.chess.scanner.regex.RegexParams; import org.hedgecode.chess.scanner.spi.ServiceRegistry; +import static org.hedgecode.chess.scanner.ChessHogScannerConstants.*; import static org.hedgecode.chess.scanner.regex.RegexBuilder.Type; /** @@ -102,6 +106,18 @@ public abstract class AbstractSettingsScanner extends AbstractRequestScanner imp ); } + protected String regex(String source, String regex) { + Matcher matcher = Pattern.compile(regex, Pattern.MULTILINE).matcher(source); + if (matcher.find()) { + return matcher.groupCount() > 0 ? matcher.group(1) : matcher.group(); + } + return null; + } + + protected boolean isPgnFormat(String source) { + return regex(source, PGN_DETECT_REGEX) != null; + } + private String assignUrlWithParams(String url, String params) { return params != null ? url.concat(params) diff --git a/src/main/java/org/hedgecode/chess/scanner/portal/ChessBombScanner.java b/src/main/java/org/hedgecode/chess/scanner/portal/ChessBombScanner.java index 7b19256..9084f5a 100644 --- a/src/main/java/org/hedgecode/chess/scanner/portal/ChessBombScanner.java +++ b/src/main/java/org/hedgecode/chess/scanner/portal/ChessBombScanner.java @@ -44,7 +44,7 @@ public class ChessBombScanner extends AbstractSettingsScanner { public PGNTournament scanTournament(String tournamentId) throws ChessHogScannerException { String decodeTournament = decodeUrlByRegex( assignUrl(tournamentId, null), - getSettings().getTournamentGamesUrlRegex() + getSettings().getTournamentJsonUrlRegex() ); TournamentFormat tournamentFormat = Format.formatTournament(decodeTournament); @@ -87,9 +87,24 @@ public class ChessBombScanner extends AbstractSettingsScanner { @Override public PGNGame scanGame(String gameId, String tournamentId) throws ChessHogScannerException { - String decodeGame = decodeUrlByRegex( + return scanGameByRegex( assignUrl(gameId, tournamentId, true), - getSettings().getGameUrlRegex() + getSettings().getGameJsonUrlRegex() + ); + } + + @Override + public PGNGame scanUrl(String gameUrl) throws ChessHogScannerException { + return scanGameByRegex( + gameUrl, + getSettings().getGameJsonUrlRegex() + ); + } + + private PGNGame scanGameByRegex(String gameUrl, String regex) throws ChessHogScannerException { + String decodeGame = decodeUrlByRegex( + gameUrl, + regex ); GameFormat gameFormat = Format.formatGame(decodeGame); @@ -100,18 +115,21 @@ public class ChessBombScanner extends AbstractSettingsScanner { ); } - @Override - public PGNGame scanUrl(String gameUrl) throws ChessHogScannerException { - return null; - } private String decodeUrlByRegex(String url, String regex) throws ChessHogScannerException { String encodeString = match( url, regex ); + if (encodeString == null) { + throw new ChessHogScannerException( + String.format("Failed to decode source data for requesting URL: %s", url) + ); + } return new String( - Base64.getDecoder().decode(encodeString) + Base64.getDecoder().decode( + encodeString + ) ); } diff --git a/src/main/java/org/hedgecode/chess/scanner/portal/ChessGamesScanner.java b/src/main/java/org/hedgecode/chess/scanner/portal/ChessGamesScanner.java index 7658ba1..28493cf 100644 --- a/src/main/java/org/hedgecode/chess/scanner/portal/ChessGamesScanner.java +++ b/src/main/java/org/hedgecode/chess/scanner/portal/ChessGamesScanner.java @@ -47,6 +47,7 @@ public class ChessGamesScanner extends AbstractSettingsScanner { @Override public PGNTournament findTournament(String tournamentName) throws ChessHogScannerException { + PGNTournament tournament = null; Map result = matchMap( assignUrl( tournamentName, true @@ -54,7 +55,6 @@ public class ChessGamesScanner extends AbstractSettingsScanner { getSettings().getTournamentQueryUrlRegex(), true ); - PGNTournament tournament = null; for (Map.Entry entry : result.entrySet()) { if (entry.getValue().contains(tournamentName)) { // todo: contains? tournament = new PGNTournament( @@ -74,6 +74,11 @@ public class ChessGamesScanner extends AbstractSettingsScanner { String pgn = request( assignUrl(gameId) ); + if (!isPgnFormat(pgn)) { + throw new ChessHogScannerException( + String.format("Failed to get PGN for requesting game ID: %s", gameId) + ); + } return new PGNGame( gameId, pgn ); @@ -86,7 +91,21 @@ public class ChessGamesScanner extends AbstractSettingsScanner { @Override public PGNGame scanUrl(String gameUrl) throws ChessHogScannerException { - return null; + String pgn = regex( + request( + gameUrl + ), + getSettings().getGameUrlRegex() + ); + if (pgn == null || !isPgnFormat(pgn)) { + throw new ChessHogScannerException( + String.format("Failed to get PGN for requesting URL: %s", gameUrl) + ); + } + return new PGNGame( + null, + pgn + ); } private void assignTournamentGames(PGNTournament tournament) throws ChessHogScannerException { diff --git a/src/main/java/org/hedgecode/chess/scanner/portal/LiChessScanner.java b/src/main/java/org/hedgecode/chess/scanner/portal/LiChessScanner.java index 134180e..18497a1 100644 --- a/src/main/java/org/hedgecode/chess/scanner/portal/LiChessScanner.java +++ b/src/main/java/org/hedgecode/chess/scanner/portal/LiChessScanner.java @@ -17,12 +17,16 @@ package org.hedgecode.chess.scanner.portal; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; + +import org.apache.commons.text.StringEscapeUtils; import org.hedgecode.chess.scanner.ChessHogScannerException; import org.hedgecode.chess.scanner.entity.PGNGame; import org.hedgecode.chess.scanner.entity.PGNTournament; +import org.hedgecode.chess.scanner.format.lichess.Format; +import org.hedgecode.chess.scanner.format.lichess.GameFormat; + +import static org.hedgecode.chess.scanner.ChessHogScannerConstants.*; /** * LiChessScanner @@ -33,10 +37,6 @@ public class LiChessScanner extends AbstractSettingsScanner { private static final String SETTINGS_FILENAME = "lichess.settings"; - private static final String TOURNAMENT_GAMES_SPLIT_REGEX = "\\[Event \"[^\"]+\"\\]"; - private static final String TOURNAMENT_NAME_REGEX = "\\[Event \"([^\"]+)\"\\]"; - private static final String GAME_ID_REGEX = "\\[Site \"https://lichess.org/([^\"]+)\"\\]"; - @Override protected String getResourceName() { return SETTINGS_FILENAME; @@ -61,6 +61,11 @@ public class LiChessScanner extends AbstractSettingsScanner { String pgn = request( assignUrl(gameId) ); + if (!isPgnFormat(pgn)) { + throw new ChessHogScannerException( + String.format("Failed to get PGN for requesting game ID: %s", gameId) + ); + } return new PGNGame( gameId, pgn ); @@ -73,7 +78,38 @@ public class LiChessScanner extends AbstractSettingsScanner { @Override public PGNGame scanUrl(String gameUrl) throws ChessHogScannerException { - return null; + String gamePage = request(gameUrl); + String pgn = regex( + gamePage, + getSettings().getGameUrlRegex() + ); + if (pgn == null) { + pgn = regex( + gamePage, + getSettings().getGameJsonUrlRegex() + ); + if (pgn == null) { + throw new ChessHogScannerException( + String.format("Failed to get source data for requesting URL: %s", gameUrl) + ); + } + GameFormat gameFormat = Format.formatGame(pgn); + return new PGNGame( + gameFormat.id(), + gameFormat.pgn() + ); + } else { + pgn = StringEscapeUtils.unescapeHtml4(pgn); + if (!isPgnFormat(pgn)) { + throw new ChessHogScannerException( + String.format("Failed to get PGN for requesting URL: %s", gameUrl) + ); + } + return new PGNGame( + regex(pgn, getSettings().getGameIdRegex()), + pgn + ); + } } private void assignTournamentGames(PGNTournament tournament) throws ChessHogScannerException { @@ -83,30 +119,27 @@ public class LiChessScanner extends AbstractSettingsScanner { tournament.id(), null ), - TOURNAMENT_GAMES_SPLIT_REGEX + PGN_DETECT_REGEX ); if (!pgnGames.isEmpty()) { tournament.setName( - find(TOURNAMENT_NAME_REGEX, pgnGames.get(0)) + regex( + pgnGames.get(0), + getSettings().getTournamentNameRegex() + ) ); } for (String pgn : pgnGames) { - String gameId = find(GAME_ID_REGEX, pgn); + String gameId = regex( + pgn, + getSettings().getGameIdRegex() + ); tournament.addGame( new PGNGame(gameId, pgn) ); } } - private String find(String regex, String pgn) { - Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE); - Matcher matcher = pattern.matcher(pgn); - if (matcher.find()) { - return matcher.group(1); - } - return null; - } - } diff --git a/src/main/resources/settings/chessbomb.settings b/src/main/resources/settings/chessbomb.settings index d6e3b65..da75990 100644 --- a/src/main/resources/settings/chessbomb.settings +++ b/src/main/resources/settings/chessbomb.settings @@ -1,9 +1,9 @@ { "tournamentUrl": "https://www.chessbomb.com/arena/[tournamentId]", "tournamentIsMultiPage": false, - "tournamentGamesUrlRegex": "cbConfigData=\"([^\"]+)\"", + "tournamentJsonUrlRegex": "cbConfigData=\"([^\"]+)\"", "tournamentQuery": "https://www.chessbomb.com/arena/", "tournamentQueryUrlRegex": "cbConfigData=\"([^\"]+)\"", "gameUrl": "https://www.chessbomb.com/arena/[tournamentId]/[gameId]", - "gameUrlRegex": "cbConfigData=\"([^\"]+)\"" + "gameJsonUrlRegex": "cbConfigData=\"([^\"]+)\"" } \ No newline at end of file diff --git a/src/main/resources/settings/chessgames.settings b/src/main/resources/settings/chessgames.settings index 452a981..bae5460 100644 --- a/src/main/resources/settings/chessgames.settings +++ b/src/main/resources/settings/chessgames.settings @@ -5,5 +5,6 @@ "tournamentQuery": "https://www.chessgames.com/perl/tournaments?query=[query]", "tournamentQueryUrlRegex": "([^<]+)", "gameUrl": "https://www.chessgames.com/perl/chessgame?gid=[gameId]", - "gamePgnUrl": "https://www.chessgames.com/perl/nph-chesspgn?gid=[gameId]&text=1" + "gamePgnUrl": "https://www.chessgames.com/perl/nph-chesspgn?gid=[gameId]&text=1", + "gameUrlRegex": "pgn='([^']+)'" } \ No newline at end of file diff --git a/src/main/resources/settings/lichess.settings b/src/main/resources/settings/lichess.settings index 4f44ba1..5a542fd 100644 --- a/src/main/resources/settings/lichess.settings +++ b/src/main/resources/settings/lichess.settings @@ -2,7 +2,11 @@ "tournamentUrl": "https://lichess.org/api/tournament/[tournamentId]/games", "tournamentIsMultiPage": false, "tournamentQueryParams": "?clocks=false&evals=false&opening=true", + "tournamentNameRegex": "\\[Event \"([^\"]+)\"\\]", "gameUrl": "https://lichess.org/[gameId]", "gamePgnUrl": "https://lichess.org/game/export/[gameId]", - "gameQueryParams": "?clocks=false&evals=false&literate=true" + "gameQueryParams": "?clocks=false&evals=false&literate=true", + "gameUrlRegex": "
([^<]+)
", + "gameJsonUrlRegex": "]+>.*=(\\{.*\"data\":\\{\"game\"[^<]+})", + "gameIdRegex" : "\\[Site \"https://lichess.org/([^\"]+)\"\\]" } \ No newline at end of file