I'm developing a Google Chrome extension and I'd like to know how to open a new tab (ok, this is simple:
chrome.tabs.create({'url': chrome.extension.getURL(mypage)}, function(tab) { /* ... */ });
) and retrieve the source code of that page.
I know that I can use AJAX to get the source, but the problem is that the web page contains some JavaScript code that edits the page, and I need the edited page.
Is it possible?
To serialize the full, live HTML document, use the following code:
// @author Rob W <http://stackoverflow.com/users/938089/rob-w>
// Demo: var serialized_html = DOMtoString(document);
/**
 * Serialize the direct children of a document (or any node) to an HTML string.
 *
 * Elements contribute their outerHTML, text nodes their raw value, and
 * CDATA sections, comments and doctypes are re-wrapped in their markup.
 * Children of any other node type are skipped.
 *
 * @param {Node} document_root - Node whose children are serialized.
 * @returns {string} The concatenated markup, in document order.
 */
function DOMtoString(document_root) {
    var serialized = '';
    var current;
    var kind;
    var doctype;

    for (current = document_root.firstChild; current; current = current.nextSibling) {
        kind = current.nodeType;

        if (kind === Node.ELEMENT_NODE) {
            serialized += current.outerHTML;
        } else if (kind === Node.TEXT_NODE) {
            serialized += current.nodeValue;
        } else if (kind === Node.CDATA_SECTION_NODE) {
            serialized += '<![CDATA[' + current.nodeValue + ']]>';
        } else if (kind === Node.COMMENT_NODE) {
            serialized += '<!--' + current.nodeValue + '-->';
        } else if (kind === Node.DOCUMENT_TYPE_NODE) {
            // (X)HTML documents are identified by public identifiers; the
            // SYSTEM keyword is only emitted when there is no public one.
            doctype = '<!DOCTYPE ' + current.name;
            if (current.publicId) {
                doctype += ' PUBLIC "' + current.publicId + '"';
            } else if (current.systemId) {
                doctype += ' SYSTEM';
            }
            if (current.systemId) {
                doctype += ' "' + current.systemId + '"';
            }
            serialized += doctype + '>\n';
        }
    }

    return serialized;
}
Now, in a Chrome extension, you have to register some event listeners in an extension page, such as a background page or popup page:
/**
 * Get the HTML source for the main frame of a given tab.
 *
 * Waits until the given tab reports status "complete", then injects
 * get_source.js into it and hands the script's result to the callback.
 * The callback is invoked at most once.
 *
 * @param {integer} tabId - ID of tab.
 * @param {function} callback - Called with the tab's source upon completion,
 *     or with '' when the tab is closed first or the script cannot be injected.
 */
function getSourceFromTab(tabId, callback) {
    // Capture the page when it has fully loaded; give up if the tab goes away.
    chrome.tabs.onUpdated.addListener(onUpdated);
    chrome.tabs.onRemoved.addListener(onRemoved);

    function onUpdated(updatedTabId, details) {
        // Both events fire for every tab, so ignore the ones we did not ask about.
        if (updatedTabId !== tabId || details.status !== 'complete') {
            return;
        }
        removeListeners();
        chrome.tabs.executeScript(tabId, {
            file: 'get_source.js'
        }, function(results) {
            // Reading lastError marks it as handled; on failure `results` is
            // not usable (e.g. missing host permission or tab closed meanwhile).
            var injectionError = chrome.runtime.lastError;
            if (injectionError || !results) {
                console.warn('getSourceFromTab: could not inject get_source.js: ' +
                    (injectionError && injectionError.message));
                callback('');
                return;
            }
            // results holds one entry per frame; the main frame comes first.
            callback(results[0]);
        });
    }

    function removeListeners() {
        chrome.tabs.onUpdated.removeListener(onUpdated);
        chrome.tabs.onRemoved.removeListener(onRemoved);
    }

    function onRemoved(removedTabId) {
        if (removedTabId !== tabId) {
            return;
        }
        removeListeners();
        callback(''); // Tab closed, no response.
    }
}
The above function passes the source code of the main frame in a tab to the callback. If you want to get the source of a child frame, call chrome.tabs.executeScript with a frameId parameter.
The next snippet shows an example of how your extension could use the function. Paste the snippet in the background page's console, or declare a browserAction, put the snippet in the onClicked
listener and click on the extension button.
// Page to open in the new tab; its host must be covered by the extension's permissions.
var mypage = 'https://example.com';

// Receives the serialized HTML once getSourceFromTab has a result for the tab.
function callback(html_string) {
    console.log('HTML string, from extension: ', html_string);
}

// Open the page, then request its source using the ID of the tab just created.
chrome.tabs.create({ url: mypage }, function(createdTab) {
    getSourceFromTab(createdTab.id, callback);
});
The referenced get_source.js
contains the following code:
function DOMtoString(document_root) {
... see top of the answer...
}
// The value of the last expression of the content script is passed
// to the chrome.tabs.executeScript callback
DOMtoString(document);
Don't forget to add the appropriate host permissions, so that you can read DOM from the page. In the above example, you have to add "https://example.com/*"
to the "permissions" section of manifest.json.
Related documentation
- Node MDN
- DocumentType (document.doctype, <!DOCTYPE ... >) MDN
- Content scripts Google Chrome Extension docs
- Match patterns Google Chrome Extension docs
- Manifest > permissions Google Chrome Extension docs
- chrome.tabs.create Google Chrome Extension docs
- chrome.tabs.executeScript Google Chrome Extension docs
- chrome.tabs.onUpdated Google Chrome Extension docs
- chrome.tabs.onRemoved Google Chrome Extension docs
- Message passing Google Chrome Extension docs