/* * Cantata * * Copyright (c) 2011-2013 Craig Drummond * * ---- * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. */ #include "wikipediaengine.h" #include "networkaccessmanager.h" #include "localize.h" #include "settings.h" #include #if QT_VERSION >= 0x050000 #include #endif #include #include #include static const char * constModeProperty="mode"; static const char * constRedirectsProperty="redirects"; static const char * constQueryProperty="query"; static const int constMaxRedirects=3; /*
 * Reviewer overview of this file (derived only from the code visible below).
 *
 * WikipediaEngine resolves a query through a chain of network requests; each
 * step connects the finished() signal of its job to the slot of the next step:
 *
 *   search()        - clears titles; for every query part that contains the
 *                     Magnatune PREVIEW marker it removes the marker and, if
 *                     the part contains a hyphen, truncates the part one
 *                     character before the first hyphen; then calls
 *                     requestTitles() with the first preferred language.
 *   requestTitles() - calls cancel(), then requests w/api.php on
 *                     LANG.wikipedia.org with action=query, list=search,
 *                     srsearch set to the query parts joined by a space,
 *                     srlimit=20 and format=xml.
 *   parseTitles()   - on a network error or an empty reply emits
 *                     searchResult() with two empty strings; otherwise appends
 *                     the title attribute of every p element found under a
 *                     search element to titles. With no titles it retries with
 *                     the next entry of preferredLangs, or emits an empty
 *                     result when there is no further entry; otherwise it
 *                     calls getPage().
 *   getPage()       - takes (removes) one entry from titles, using the first
 *                     entry when no match was found, emits an empty result for
 *                     the title List of CJK Unified Ideographs, and otherwise
 *                     requests /wiki/Special:Export/TITLE.
 *   parsePage()     - requests the redirect target when the reply carries a
 *                     Location header, calls getPage() again when the text
 *                     contains a disambiguation template, and otherwise emits
 *                     searchResult() with the output of wikiToHtml() and the
 *                     language taken from the reply host by getLang().
 *
 * The mode, the redirect count and the query travel between the steps as
 * dynamic properties set on the job (constModeProperty, constRedirectsProperty,
 * constQueryProperty). constMaxRedirects is presumably the redirect cap checked
 * in parsePage() - that comparison is not intact in this copy, so confirm.
 *
 * The constructor loads preferredLangs from Settings when the static list is
 * still empty; setPreferedLangs() falls back to a single en entry when given
 * an empty list.
 *
 * NOTE(review): this copy of the file has lost text in several places (include
 * targets, parts of string literals, the body of the local indexOf() helper
 * and the start of getPage()). Verify against the original source before
 * relying on any of the damaged statements.
 *
 * strip(string, open, close, inner)
 *   Returns string without the sections that begin with open and end with
 *   close. The first opener is searched case-insensitively; the later searches
 *   use the default indexOf() arguments. If open does not occur at all, string
 *   is returned unchanged. inner defaults to open when it is empty; it is the
 *   token counted between an opener and its closer to detect nested sections,
 *   and for each nested opener found the closer is advanced to the next
 *   occurrence of close. If a section never closes, scanning stops and nothing
 *   from that opener onwards is appended to the result.
 */ static QString strip(const QString &string, QString open, QString close, QString inner=QString()) { QString result; int next, /*lastLeft, */left = 0; int pos = string.indexOf(open, 0, Qt::CaseInsensitive); if (pos < 0) { return string; } if (inner.isEmpty()) { inner = open; } while (pos > -1) { result += string.mid(left, pos - left); // lastLeft = left; left = string.indexOf(close, pos); if (left < 0) { // opens, but doesn't close break; } else { next = pos; while (next > -1 && left > -1) { // search for inner iterations int count = 0; int lastNext = next; while ((next = string.indexOf(inner, next+inner.length())) < left && next > -1) { // count inner section openers lastNext = next; ++count; } next = lastNext; // set back next to last inside opener for next iteration if (!count) { // no inner sections, skip break; 
} for (int i = 0; i < count; ++i) { // shift section closers by inside section amount left = string.indexOf(close, left+close.length()); } // "continue" - search for next inner section } if (left < 0) { // section does not close, skip here break; } left += close.length(); // extend close to next search start } if (left < 0) { // section does not close, skip here break; } pos = string.indexOf(open, left); // search next 1st level section opener } if (left > -1) { // append last part result += string.mid(left); } return result; } /* wikiToHtml: converts the text of an exported wiki page (answer) into the markup that parsePage() hands to searchResult(). As visible here it (1) computes start and end offsets, moving end before the last Sources, Notes, References or External links heading when that heading lies after start - presumably to trim answer to that span, but the trimming statement is not intact in this copy, (2) removes delimited sections with strip() and inline references with a QRegExp, (3) rewrites wiki markup through the chain of replace() calls that follows, and (4) cuts out the text from the translated Track listing title up to the next match of a following marker. NOTE(review): several string literals in this function are empty or broken across lines in this copy, so the exact delimiters and replacement markup cannot be confirmed from here. */ static QString wikiToHtml(QString answer) { int start = answer.indexOf('>', answer.indexOf(" start && e < end) { end = e; } e = answer.lastIndexOf(QRegExp("\n==\\s*Sources\\s*==")); if (e > start && e < end) { end = e; } e = answer.lastIndexOf(QRegExp("\n==\\s*Notes\\s*==")); if (e > start && e < end) { end = e; } e = answer.lastIndexOf(QRegExp("\n==\\s*References\\s*==")); if (e > start && e < end) { end = e; } e = answer.lastIndexOf(QRegExp("\n==\\s*External links\\s*==")); if (e > start && e < end) { end = e; } if (end < start) { end = answer.lastIndexOf(""); answer = strip(answer, ""); // strip comments answer.remove(QRegExp("]*/>")); // strip inline references answer = strip(answer, "", "", ""); answer.replace("\n\n", "
"); // answer.replace("\n\n", "

"); answer.replace(QRegExp("\\n'''([^\\n]*)'''\\n"), "


\\1\n"); answer.replace(QRegExp("\\n\\{\\|[^\\n]*\\n"), "\n"); answer.replace(QRegExp("\\n\\|[^\\n]*\\n"), "\n"); answer.replace("\n*", "
"); answer.replace("\n", ""); answer.replace("'''", "¬").replace(QRegExp("¬([^¬]*)¬"), "\\1"); answer.replace("''", "¬").replace(QRegExp("¬([^¬]*)¬"), "\\1"); answer.replace("===", "¬").replace(QRegExp("¬([^¬]*)¬"), "

\\1

"); answer.replace("==", "¬").replace(QRegExp("¬([^¬]*)¬"), "

\\1

"); answer.replace("&nbsp;", " "); answer.replace("

", ""); answer.replace("
", ""); answer.replace("

=", "

"); answer.replace("

=", ""); answer.replace("br>;", "br>"); answer.replace("h2>;", "h2>"); answer.replace("h3>;", "h3>"); // Remove track listings - we take these from MPD... QString listingText="

"+i18n("Track listing")+"

"; int listingStart=answer.indexOf(listingText, 0, Qt::CaseInsensitive); if (-1!=listingStart) { int listingEnd=answer.indexOf("

", listingStart+listingText.length(), Qt::CaseInsensitive); if (listingStart!=listingEnd) { answer=answer.left(listingStart)+answer.mid(listingEnd); } } return answer; } static inline QString getLang(const QUrl &url) { return url.host().remove(QLatin1String(".wikipedia.org")); } QStringList WikipediaEngine::preferredLangs; WikipediaEngine::WikipediaEngine(QObject *p) : ContextEngine(p) { if (preferredLangs.isEmpty()) { setPreferedLangs(Settings::self()->wikipediaLangs()); } } void WikipediaEngine::setPreferedLangs(const QStringList &l) { preferredLangs=l; if (preferredLangs.isEmpty()) { preferredLangs.append("en"); } } void WikipediaEngine::search(const QStringList &query, Mode mode) { titles.clear(); QStringList fixedQuery; foreach (QString q, query) { if (q.contains(QLatin1String("PREVIEW: buy it at www.magnatune.com"))) { q = q.remove(QLatin1String(" (PREVIEW: buy it at www.magnatune.com)")); int index = q.indexOf(QLatin1Char('-')); if (-1!=index) { q = q.left(index - 1); } } fixedQuery.append(q); } requestTitles(fixedQuery, mode, getPrefix(preferredLangs.first())); } void WikipediaEngine::requestTitles(const QStringList &query, Mode mode, const QString &lang) { cancel(); QUrl url("https://"+lang+".wikipedia.org/w/api.php"); #if QT_VERSION < 0x050000 QUrl &q=url; #else QUrlQuery q; #endif q.addQueryItem(QLatin1String("action"), QLatin1String("query")); q.addQueryItem(QLatin1String("list"), QLatin1String("search")); q.addQueryItem(QLatin1String("srsearch"), query.join(" ")); q.addQueryItem(QLatin1String("srprop"), QLatin1String("size")); q.addQueryItem(QLatin1String("srredirects"), QString::number(1)); q.addQueryItem(QLatin1String("srlimit"), QString::number(20)); q.addQueryItem(QLatin1String("format"), QLatin1String("xml")); #if QT_VERSION >= 0x050000 url.setQuery(q); #endif job=NetworkAccessManager::self()->get(url); job->setProperty(constModeProperty, (int)mode); job->setProperty(constRedirectsProperty, 0); job->setProperty(constQueryProperty, query); 
qWarning() << __FUNCTION__ << url.toString(); connect(job, SIGNAL(finished()), this, SLOT(parseTitles())); } void WikipediaEngine::parseTitles() { qWarning() << __FUNCTION__; QNetworkReply *reply = getReply(sender()); if (!reply) { return; } QUrl url=reply->url(); QString hostLang=getLang(url); QByteArray data=reply->readAll(); if (QNetworkReply::NoError!=reply->error() || data.isEmpty()) { qWarning() << __FUNCTION__ << reply->errorString(); emit searchResult(QString(), QString()); return; } QStringList query = reply->property(constQueryProperty).toStringList(); Mode mode=(Mode)reply->property(constModeProperty).toInt(); QXmlStreamReader xml(data); while (!xml.atEnd() && !xml.hasError()) { xml.readNext(); if (xml.isStartElement() && QLatin1String("search")==xml.name()) { while (xml.readNextStartElement()) { if (QLatin1String("p")==xml.name()) { if (xml.attributes().hasAttribute(QLatin1String("title"))) { titles << xml.attributes().value(QLatin1String("title")).toString(); } xml.skipCurrentElement(); } else { xml.skipCurrentElement(); } } } } if (titles.isEmpty()) { qWarning() << __FUNCTION__ << "No titles"; QRegExp regex(QLatin1Char('^') + hostLang + QLatin1String(".*$")); int index = preferredLangs.indexOf(regex); if (-1!=index && index < preferredLangs.count()-1) { // use next preferred language as base for fetching langlinks since // the current one did not get any results we want. requestTitles(query, mode, getPrefix(preferredLangs.value(index+1))); } else { qWarning() << __FUNCTION__ << "No more langs"; emit searchResult(QString(), QString()); } return; } getPage(query, mode, hostLang); } static int indexOf(const QStringList &l, const QString &s) { for (int i=0; i ASAP) QString query2=q; query2.remove("."); foreach (const QString &pattern, patterns) { index=indexOf(titles, query2+" ("+pattern+")"); if (-1!=index) { qWarning() << __FUNCTION__ << "Matched with pattern (no dots)" << index << q; break; } } } if (-1==index) { // Try without any pattern... 
index=indexOf(titles, q); if (-1!=index) { qWarning() << __FUNCTION__ << "Matched without pattern" << index << q; } } if (-1==index && q.contains(".")) { // Try without any pattern, and no dots.. QString query2=q; query2.remove("."); index=indexOf(titles, query2); if (-1!=index) { qWarning() << __FUNCTION__ << "Matched without pattern (no dots)" << index << q; } } if (-1!=index) { break; } } // TODO: If we fail to find a match, prompt user??? const QString title=titles.takeAt(-1==index ? 0 : index); if (QLatin1String("List of CJK Unified Ideographs")==title) { qWarning() << __FUNCTION__ << "Unicode list?"; emit searchResult(QString(), QString()); return; } QUrl url; url.setScheme(QLatin1String("https")); url.setHost(lang+".wikipedia.org"); url.setPath("/wiki/Special:Export/"+title); job=NetworkAccessManager::self()->get(url); job->setProperty(constModeProperty, (int)mode); job->setProperty(constQueryProperty, query); job->setProperty(constRedirectsProperty, 0); qWarning() << __FUNCTION__ << url.toString(); connect(job, SIGNAL(finished()), this, SLOT(parsePage())); } void WikipediaEngine::parsePage() { qWarning() << __FUNCTION__; QNetworkReply *reply = getReply(sender()); if (!reply) { return; } QVariant redirect = reply->header(QNetworkRequest::LocationHeader); int numRirects=reply->property(constRedirectsProperty).toInt(); if (redirect.isValid() && ++numRirectsget(redirect.toString()); job->setProperty(constRedirectsProperty, numRirects); job->setProperty(constModeProperty, reply->property(constModeProperty)); job->setProperty(constQueryProperty, reply->property(constQueryProperty)); connect(job, SIGNAL(finished()), this, SLOT(parsePage())); return; } QByteArray data=reply->readAll(); if (QNetworkReply::NoError!=reply->error() || data.isEmpty()) { emit searchResult(QString(), QString()); return; } QString answer(QString::fromUtf8(data)); qWarning() << __FUNCTION__ << "Anser" << answer; QUrl url=reply->url(); QString hostLang=getLang(url); if 
(answer.contains(QLatin1String("{{disambiguation}}")) || answer.contains(QLatin1String("{{disambig}}"))) { // i18n??? qWarning() << __FUNCTION__ << "Disambiguation"; getPage(reply->property(constQueryProperty).toStringList(), (Mode)reply->property(constModeProperty).toInt(), hostLang); return; } emit searchResult(wikiToHtml(answer), hostLang); }