/* * Cantata * * Copyright (c) 2011-2017 Craig Drummond * * ---- * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. */ #include "wikipediaengine.h" #include "network/networkaccessmanager.h" #include "gui/settings.h" #include "gui/covers.h" #include "config.h" #include #include #include #include static bool debugEnabled=false; #define DBUG if (debugEnabled) qWarning() << metaObject()->className() << __FUNCTION__ void WikipediaEngine::enableDebug() { debugEnabled=true; } static const char * constModeProperty="mode"; static const char * constQueryProperty="query"; static QString wikipediaSpecialExport(const QString &lang) { static QMap links; if (links.isEmpty()) { links.insert(QLatin1String("de"), QString("Spezial:Exportieren")); links.insert(QLatin1String("ru"), QString("Служебная:Экспорт")); } QMap::ConstIterator it=links.find(lang); return "/"+(it==links.constEnd() ? QLatin1String("Special:Export") : it.value())+"/"; } static QString fixWikiLink(const QUrl &url) { QString lang=url.host().split(".").first(); QString path=url.path(); QString fixed(path); fixed=fixed.replace("/wiki"+wikipediaSpecialExport(lang), "/wiki/"); if (path==fixed) { QStringList parts=fixed.split("/", QString::SkipEmptyParts); if (parts.length()>1) { parts.removeAt(1); fixed=parts.join("/"); } } QUrl u(url); u.setPath(fixed); return u.toString(); } static QString strip(const QString &string, QString open, QString close, QString inner=QString()) { QString result; int next, /*lastLeft, */left = 0; int pos = string.indexOf(open, 0, Qt::CaseInsensitive); if (pos < 0) { return string; } if (inner.isEmpty()) { inner = open; } while (pos > -1) { result += string.mid(left, pos - left); // lastLeft = left; left = string.indexOf(close, pos); if (left < 0) { // opens, but doesn't close break; } else { next = pos; while (next > -1 && left > -1) { // search for inner iterations int count = 0; int lastNext = next; while ((next = string.indexOf(inner, next+inner.length())) < left && next > -1) { // count inner section openers lastNext = next; ++count; } next = lastNext; // set back next to last inside opener for next iteration if (!count) { // no inner sections, skip break; } for (int i = 0; i < count; ++i) { // shift section closers by inside section amount left = string.indexOf(close, left+close.length()); } // "continue" - search for next inner section } if (left < 0) { // section does not close, skip here break; } left += close.length(); // extend close to next search start } if (left < 0) { // section does not close, skip here break; } pos = string.indexOf(open, left); // search next 1st level section opener } if (left > -1) { // append last part result += string.mid(left); } return result; } static QString stripEmptySections(QString answer) { QStringList headers=QStringList() << "h3" << "h2" << "b"; foreach (const QString &h1, headers) { foreach (const QString &h2, headers) { int end=-1; do { end=answer.indexOf("<"+h2+">"); int realEnd=end+3+h1.length(); if (-1==end) { end=answer.indexOf("

<"+h2+">"); realEnd=end+11+h1.length(); } if (-1!=end) { int start=answer.lastIndexOf("<"+h1+">", end); if (-1!=start) { answer=answer.left(start)+answer.mid(realEnd); } } } while(-1!=end); } } return answer; } static QString stripLastEmptySection(QString answer) { QStringList headers=QStringList() << "h3" << "h2" << "b"; bool modified=false; do { modified=false; foreach (const QString &h, headers) { if (answer.endsWith("") || answer.endsWith(" ") || answer.endsWith(" ")) { int start=answer.lastIndexOf("<"+h+">", answer.length()-4); if (-1!=start) { answer=answer.left(start); modified=true; } } } } while (modified); return answer; } static QString wikiToHtml(QString answer, bool introOnly, const QUrl &url) { QString u=fixWikiLink(url); int start = answer.indexOf('>', answer.indexOf(" start && e < end) { end = e; } e = answer.lastIndexOf(QRegExp("\n==\\s*Sources\\s*==")); if (e > start && e < end) { end = e; } e = answer.lastIndexOf(QRegExp("\n==\\s*Notes\\s*==")); if (e > start && e < end) { end = e; } e = answer.lastIndexOf(QRegExp("\n==\\s*References\\s*==")); if (e > start && e < end) { end = e; } e = answer.lastIndexOf(QRegExp("\n==\\s*External links\\s*==")); if (e > start && e < end) { end = e; } if (end < start) { end = answer.lastIndexOf(""); answer = strip(answer, ""); // strip comments answer.remove(QRegExp("]*/>")); // strip inline refereces answer = strip(answer, "", "", ""); answer.replace("\n\n", "
"); answer.replace("( ; ", "("); // answer.replace("\n\n", "

"); answer.replace(QRegExp("\\n'''([^\\n]*)'''\\n"), "


\\1\n"); answer.replace(QRegExp("\\n\\{\\|[^\\n]*\\n"), "\n"); answer.replace(QRegExp("\\n\\|[^\\n]*\\n"), "\n"); answer.replace("\n*", "
"); answer.replace("\n", ""); answer.replace("'''s ", "'s"); answer.replace("'''", "¬").replace(QRegExp("¬([^¬]*)¬"), "\\1"); answer.replace("''", "¬").replace(QRegExp("¬([^¬]*)¬"), "\\1"); if (!introOnly) { answer.replace("===", "¬").replace(QRegExp("¬([^¬]*)¬"), "

\\1

"); answer.replace("==", "¬").replace(QRegExp("¬([^¬]*)¬"), "

\\1

"); } answer.replace("&nbsp;", " "); answer.replace("–", "-"); answer.replace("
")) { answer+="
"; } answer+=QString("
%2").arg(u).arg(WikipediaEngine::constReadMorePlaceholder); } } else { answer.replace("
", ""); answer.replace("
", ""); answer.replace("

=", "

"); answer.replace("

=", ""); answer.replace("br>;", "br>"); answer.replace("h2>;", "h2>"); answer.replace("h3>;", "h3>"); answer.replace("




", "

"); answer.replace("



", "

"); answer.replace("


", "

"); // Remove track listings - we take these from MPD... QString listingText="

"+QObject::tr("Track listing")+"

"; start=answer.indexOf(listingText, 0, Qt::CaseInsensitive); if (-1!=start) { int end=answer.indexOf("

", start+listingText.length(), Qt::CaseInsensitive); if (start!=end) { answer=answer.left(start)+answer.mid(end); } } // Try to remove empty sections (that will have been reated because we removed tables!) answer=stripEmptySections(answer); answer=stripLastEmptySection(answer); } if (!introOnly) { if (!answer.endsWith("
")) { answer+="
"; } answer+=QString("
%2").arg(u).arg(WikipediaEngine::constOpenInBrowserPlaceholder); } return answer; } static inline QString getLang(const QUrl &url) { return url.host().remove(QLatin1String(".wikipedia.org")); } QStringList WikipediaEngine::preferredLangs; bool WikipediaEngine::introOnly=true; const QLatin1String WikipediaEngine::constReadMorePlaceholder("XXX_CONTEXT_READ_MORE_ON_WIKIPEDIA_XXX"); const QLatin1String WikipediaEngine::constOpenInBrowserPlaceholder("XXX_CONTEXT_OPEN_IN_BROWSER_WIKIPEDIA_XXX"); WikipediaEngine::WikipediaEngine(QObject *p) : ContextEngine(p) { if (preferredLangs.isEmpty()) { setPreferedLangs(Settings::self()->wikipediaLangs()); introOnly=Settings::self()->wikipediaIntroOnly(); } } void WikipediaEngine::setPreferedLangs(const QStringList &l) { preferredLangs=l; if (preferredLangs.isEmpty()) { preferredLangs.append("en"); } } QString WikipediaEngine::translateLinks(QString text) const { text=text.replace(constReadMorePlaceholder, QObject::tr("Read more on wikipedia")); text=text.replace(constOpenInBrowserPlaceholder, QObject::tr("Open in browser")); return text; } void WikipediaEngine::search(const QStringList &query, Mode mode) { titles.clear(); // if (Track==mode) { // emit searchResult(QString(), QString()); // return; // } requestTitles(fixQuery(query), mode, getPrefix(preferredLangs.first())); } void WikipediaEngine::requestTitles(const QStringList &query, Mode mode, const QString &lang) { cancel(); QUrl url("https://"+lang+".wikipedia.org/w/api.php"); QUrlQuery q; q.addQueryItem(QLatin1String("action"), QLatin1String("query")); q.addQueryItem(QLatin1String("list"), QLatin1String("search")); q.addQueryItem(QLatin1String("srsearch"), query.join(" ")); q.addQueryItem(QLatin1String("srprop"), QLatin1String("size")); q.addQueryItem(QLatin1String("srredirects"), QString::number(1)); q.addQueryItem(QLatin1String("srlimit"), QString::number(20)); q.addQueryItem(QLatin1String("format"), QLatin1String("xml")); url.setQuery(q); job=NetworkAccessManager::self()->get(url); job->setProperty(constModeProperty, (int)mode); job->setProperty(constQueryProperty, query); DBUG << url.toString(); connect(job, SIGNAL(finished()), this, SLOT(parseTitles())); } void WikipediaEngine::parseTitles() { DBUG << __FUNCTION__; NetworkJob *reply = getReply(sender()); if (!reply) { return; } QUrl url=reply->url(); QString hostLang=getLang(url); QByteArray data=reply->readAll(); if (!reply->ok() || data.isEmpty()) { DBUG << reply->errorString(); emit searchResult(QString(), QString()); return; } QStringList query = reply->property(constQueryProperty).toStringList(); Mode mode=(Mode)reply->property(constModeProperty).toInt(); QXmlStreamReader xml(data); while (!xml.atEnd() && !xml.hasError()) { xml.readNext(); if (xml.isStartElement() && QLatin1String("search")==xml.name()) { while (xml.readNextStartElement()) { if (QLatin1String("p")==xml.name()) { if (xml.attributes().hasAttribute(QLatin1String("title"))) { titles << xml.attributes().value(QLatin1String("title")).toString(); } xml.skipCurrentElement(); } else { xml.skipCurrentElement(); } } } } if (titles.isEmpty()) { DBUG << "No titles"; QRegExp regex(QLatin1Char('^') + hostLang + QLatin1String(".*$")); int index = preferredLangs.indexOf(regex); if (-1!=index && index < preferredLangs.count()-1) { // use next preferred language as base for fetching langlinks since // the current one did not get any results we want. requestTitles(query, mode, getPrefix(preferredLangs.value(index+1))); } else { DBUG << "No more langs"; emit searchResult(QString(), QString()); } return; } DBUG << titles; getPage(query, mode, hostLang); } static int indexOf(const QStringList &l, const QString &s) { QString search=s.simplified(); for (int i=0; i replacements; replacements.insert(QLatin1String("."), QString()); // A.S.A.P. -> ASAP replacements.insert(QLatin1String("-"), QLatin1String("/")); // AC-DC -> AC/DC QMap::ConstIterator repEnd=replacements.constEnd(); while (!queryCopy.isEmpty()) { QString q=queryCopy.join(" "); queries.append(q); for (QMap::ConstIterator rep=replacements.constBegin(); rep!=repEnd; ++rep) { QString q2=q; q2.replace(rep.key(), rep.value()); if (q2!=q) { queries.append(q2); } } queryCopy.takeFirst(); } QStringList patterns; QStringList englishPatterns; switch (mode) { default: case Artist: patterns=tr("artist|band|singer|vocalist|musician", "Search pattern for an artist or band, separated by |").split("|", QString::SkipEmptyParts); englishPatterns=QString(QLatin1String("artist|band|singer|vocalist|musician")).split("|"); break; case Album: patterns=tr("album|score|soundtrack", "Search pattern for an album, separated by |").split("|", QString::SkipEmptyParts); englishPatterns=QString(QLatin1String("album|score|soundtrack")).split("|"); break; case Track: // patterns=trc("Search pattern for a song, separated by |", "song|track").split("|", QString::SkipEmptyParts); // englishPatterns=QString(QLatin1String("song|track")).split("|"); break; } foreach (const QString &eng, englishPatterns) { if (!patterns.contains(eng)) { patterns.append(eng); } } DBUG << "Titles" << titles; int index=-1; if ((mode==Album || mode==Track) && 2==query.count()) { DBUG << "Check track/album"; foreach (const QString &pattern, patterns) { QString q=query.at(1)+" ("+query.at(0)+" "+pattern+")"; DBUG << "Try" << q; index=indexOf(simplifiedTitles, q); if (-1!=index) { DBUG << "Matched with '$album/$track ($artist pattern)" << index << q; break; } } } if (-1==index) { foreach (const QString &q, queries) { DBUG << "Query" << q; // First check original query with one of the patterns... foreach (const QString &pattern, patterns) { index=indexOf(simplifiedTitles, q+" ("+pattern+")"); if (-1!=index) { DBUG << "Matched with pattern" << index << QString(q+" ("+pattern+")"); break; } } if (-1==index) { // Try without any pattern... index=indexOf(simplifiedTitles, q); if (-1!=index) { DBUG << "Matched without pattern" << index << q; } } if (-1!=index) { break; } } } // TODO: If we fail to find a match, prompt user??? if (-1==index) { DBUG << "Failed to find match"; emit searchResult(QString(), QString()); return; } const QString title=titles.takeAt(index); if (QLatin1String("List of CJK Unified Ideographs")==title) { DBUG << "Unicode list?"; emit searchResult(QString(), QString()); return; } QUrl url; url.setScheme(QLatin1String("https")); url.setHost(lang+".wikipedia.org"); url.setPath("/wiki"+wikipediaSpecialExport(lang)+title); job=NetworkAccessManager::self()->get(url); job->setProperty(constModeProperty, (int)mode); job->setProperty(constQueryProperty, query); DBUG << url.toString(); connect(job, SIGNAL(finished()), this, SLOT(parsePage())); } void WikipediaEngine::parsePage() { DBUG << __FUNCTION__; NetworkJob *reply = getReply(sender()); if (!reply) { return; } QByteArray data=reply->readAll(); if (!reply->ok() || data.isEmpty()) { DBUG << "Empty/error"; emit searchResult(QString(), QString()); return; } QString answer(QString::fromUtf8(data)); //DBUG << "Anser" << answer; QUrl url=reply->url(); QString hostLang=getLang(url); QStringList query=reply->property(constQueryProperty).toStringList(); Mode mode=(Mode)reply->property(constModeProperty).toInt(); if (answer.contains(QLatin1String("{{disambiguation}}")) || answer.contains(QLatin1String("{{disambig}}"))) { // tr??? getPage(query, mode, hostLang); return; } if (answer.isEmpty()) { emit searchResult(QString(), QString()); return; } QString resp=wikiToHtml(answer, introOnly, reply->url()); if (introOnly && resp.isEmpty()) { resp=wikiToHtml(answer, false, reply->url()); } // For track results, ensure response contains artist name! if (Track==mode && !resp.contains(query.at(0), Qt::CaseInsensitive) && !resp.contains(Covers::fixArtist(query.at(0)), Qt::CaseInsensitive)) { getPage(query, mode, hostLang); } else { emit searchResult(resp, hostLang); } }